diff --git a/clang/include/clang/Basic/BuiltinsRISCV.def b/clang/include/clang/Basic/BuiltinsRISCV.def
--- a/clang/include/clang/Basic/BuiltinsRISCV.def
+++ b/clang/include/clang/Basic/BuiltinsRISCV.def
@@ -79,5 +79,9 @@
 TARGET_BUILTIN(__builtin_riscv_sm3p0, "LiLi", "nc", "zksh")
 TARGET_BUILTIN(__builtin_riscv_sm3p1, "LiLi", "nc", "zksh")
 
+// Zihintntl extension
+TARGET_BUILTIN(__builtin_riscv_ntl_load, "v.", "t", "experimental-zihintntl")
+TARGET_BUILTIN(__builtin_riscv_ntl_store, "v.", "t", "experimental-zihintntl")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19762,6 +19762,11 @@
     assert(Error == ASTContext::GE_None && "Unexpected error");
   }
 
+  if (BuiltinID == RISCV::BI__builtin_riscv_ntl_load)
+    ICEArguments |= (1 << 1);
+  if (BuiltinID == RISCV::BI__builtin_riscv_ntl_store)
+    ICEArguments |= (1 << 2);
+
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
     // If this is a normal argument, just emit it as a scalar.
     if ((ICEArguments & (1 << i)) == 0) {
@@ -19964,6 +19969,49 @@
     IntrinsicTypes = {ResultType};
     break;
 
+  // Zihintntl
+  case RISCV::BI__builtin_riscv_ntl_load: {
+    llvm::Type *ResTy = ConvertType(E->getType());
+    ConstantInt *Mode = cast<ConstantInt>(Ops[1]);
+
+    llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
+        getLLVMContext(),
+        llvm::ConstantAsMetadata::get(Builder.getInt32(Mode->getZExtValue())));
+    llvm::MDNode *NontemporalNode = llvm::MDNode::get(
+        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+
+    int Width = ResTy->getPrimitiveSizeInBits();
+    LoadInst *Load = Builder.CreateLoad(
+        Address(Ops[0], ResTy, CharUnits::fromQuantity(Width / 8)));
+
+    Load->setMetadata(CGM.getModule().getMDKindID("nontemporal"),
+                      NontemporalNode);
+    Load->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
+                      RISCVDomainNode);
+
+    return Load;
+  }
+  case RISCV::BI__builtin_riscv_ntl_store: {
+    ConstantInt *Mode = cast<ConstantInt>(Ops[2]);
+
+    llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
+        getLLVMContext(),
+        llvm::ConstantAsMetadata::get(Builder.getInt32(Mode->getZExtValue())));
+    llvm::MDNode *NontemporalNode = llvm::MDNode::get(
+        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+
+    Value *BC = Builder.CreateBitCast(
+        Ops[0], llvm::PointerType::getUnqual(Ops[1]->getType()), "cast");
+
+    StoreInst *Store = Builder.CreateDefaultAlignedStore(Ops[1], BC);
+    Store->setMetadata(CGM.getModule().getMDKindID("nontemporal"),
+                       NontemporalNode);
+    Store->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
+                       RISCVDomainNode);
+
+    return Store;
+  }
+
   // Vector builtins are handled from here.
 #include "clang/Basic/riscv_vector_builtin_cg.inc"
 }
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -98,6 +98,10 @@
   htmxlintrin.h
   )
 
+set(riscv_files
+  riscv_ntlh.h
+  )
+
 set(systemz_files
   s390intrin.h
   vecintrin.h
@@ -244,6 +248,7 @@
   ${opencl_files}
   ${ppc_files}
   ${ppc_htm_files}
+  ${riscv_files}
   ${systemz_files}
   ${ve_files}
   ${x86_files}
@@ -425,7 +430,7 @@
 add_header_target("mips-resource-headers" "${mips_msa_files}")
 add_header_target("ppc-resource-headers" "${ppc_files};${ppc_wrapper_files}")
 add_header_target("ppc-htm-resource-headers" "${ppc_htm_files}")
-add_header_target("riscv-resource-headers" "${riscv_generated_files}")
+add_header_target("riscv-resource-headers" "${riscv_files};${riscv_generated_files}")
 add_header_target("systemz-resource-headers" "${systemz_files}")
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
@@ -548,6 +553,12 @@
   EXCLUDE_FROM_ALL
   COMPONENT riscv-resource-headers)
 
+install(
+  FILES ${riscv_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT riscv-resource-headers)
+
 install(
   FILES ${systemz_files}
   DESTINATION ${header_install_dir}
diff --git a/clang/lib/Headers/riscv_ntlh.h b/clang/lib/Headers/riscv_ntlh.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/Headers/riscv_ntlh.h
@@ -0,0 +1,28 @@
+/*===---- riscv_ntlh.h - RISC-V NTLH intrinsics ----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_NTLH_H
+#define __RISCV_NTLH_H
+
+#ifndef __riscv_zihintntl
+#error "NTLH intrinsics require the NTLH extension."
+#endif
+
+enum {
+  __RISCV_NTLH_INNERMOST_PRIVATE = 2,
+  __RISCV_NTLH_ALL_PRIVATE,
+  __RISCV_NTLH_INNERMOST_SHARED,
+  __RISCV_NTLH_ALL
+};
+
+#define __riscv_ntl_load(PTR, DOMAIN) __builtin_riscv_ntl_load((PTR), (DOMAIN))
+#define __riscv_ntl_store(PTR, VAL, DOMAIN) \
+  __builtin_riscv_ntl_store((PTR), (VAL), (DOMAIN))
+
+#endif
\ No newline at end of file
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -4652,6 +4652,65 @@
   // Check if rnum is in [0, 10]
   case RISCV::BI__builtin_riscv_aes64ks1i_64:
     return SemaBuiltinConstantArgRange(TheCall, 1, 0, 10);
+  case RISCV::BI__builtin_riscv_ntl_load:
+  case RISCV::BI__builtin_riscv_ntl_store:
+    DeclRefExpr *DRE =
+        cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
+    assert((BuiltinID == RISCV::BI__builtin_riscv_ntl_store ||
+            BuiltinID == RISCV::BI__builtin_riscv_ntl_load) &&
+           "Unexpected RISC-V nontemporal load/store builtin!");
+    bool IsStore = BuiltinID == RISCV::BI__builtin_riscv_ntl_store;
+    unsigned NumArgs = IsStore ? 3 : 2;
+
+    if (checkArgCount(*this, TheCall, NumArgs))
+      return true;
+
+    // Domain value should be compile-time constant.
+    // 2 <= domain <= 5
+    if (SemaBuiltinConstantArgRange(TheCall, NumArgs - 1, 2, 5))
+      return true;
+
+    Expr *PointerArg = TheCall->getArg(0);
+    ExprResult PointerArgResult =
+        DefaultFunctionArrayLvalueConversion(PointerArg);
+
+    if (PointerArgResult.isInvalid())
+      return true;
+    PointerArg = PointerArgResult.get();
+
+    const PointerType *PtrType = PointerArg->getType()->getAs<PointerType>();
+    if (!PtrType) {
+      Diag(DRE->getBeginLoc(), diag::err_nontemporal_builtin_must_be_pointer)
+          << PointerArg->getType() << PointerArg->getSourceRange();
+      return true;
+    }
+
+    QualType ValType = PtrType->getPointeeType();
+    ValType = ValType.getUnqualifiedType();
+    if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
+        !ValType->isBlockPointerType() && !ValType->isFloatingType() &&
+        !ValType->isVectorType()) {
+      Diag(DRE->getBeginLoc(),
+           diag::err_nontemporal_builtin_must_be_pointer_intfltptr_or_vector)
+          << PointerArg->getType() << PointerArg->getSourceRange();
+      return true;
+    }
+
+    if (!IsStore) {
+      TheCall->setType(ValType);
+      return false;
+    }
+
+    ExprResult ValArg = TheCall->getArg(1);
+    InitializedEntity Entity = InitializedEntity::InitializeParameter(
+        Context, ValType, /*consume*/ false);
+    ValArg = PerformCopyInitialization(Entity, SourceLocation(), ValArg);
+    if (ValArg.isInvalid())
+      return true;
+
+    TheCall->setArg(1, ValArg.get());
+    TheCall->setType(Context.VoidTy);
+    return false;
   }
 
   return false;
diff --git a/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c b/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c
@@ -0,0 +1,153 @@
+// RUN: %clang_cc1 -triple riscv32 -target-feature +experimental-zihintntl -emit-llvm %s -o - \
+// RUN:   | FileCheck %s
+
+#include <riscv_ntlh.h>
+
+signed char sc;
+unsigned char uc;
+signed short ss;
+unsigned short us;
+signed int si;
+unsigned int ui;
+signed long long sll;
+unsigned long long ull;
+_Float16 h1, h2;
+float f1, f2;
+double d1, d2;
+typedef int v4si __attribute__((vector_size(16)));
+typedef signed short v8ss __attribute__((vector_size(16)));
+typedef signed char v16sc __attribute__((vector_size(16)));
+v4si v4si1, v4si2;
+v8ss v8ss1, v8ss2;
+v16sc v16sc1, v16sc2;
+
+// clang-format off
+void ntl_all_sizes() { // CHECK-LABEL: ntl_all_sizes
+  uc = __riscv_ntl_load(&sc, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+  sc = __riscv_ntl_load(&uc, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+  us = __riscv_ntl_load(&ss, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+  ss = __riscv_ntl_load(&us, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+  ui = __riscv_ntl_load(&si, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+  si = __riscv_ntl_load(&ui, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+  ull = __riscv_ntl_load(&sll, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  sll = __riscv_ntl_load(&ull, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load
half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5 + f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5 + d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5 + v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5 + v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5 + v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5 + + uc = __riscv_ntl_load(&sc, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6 + sc = __riscv_ntl_load(&uc, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6 + us = __riscv_ntl_load(&ss, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6 + ss = __riscv_ntl_load(&us, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6 + ui = __riscv_ntl_load(&si, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6 + si = __riscv_ntl_load(&ui, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6 + ull = __riscv_ntl_load(&sll, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6 + sll = __riscv_ntl_load(&ull, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6 + h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6 + f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6 + d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6 + v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6 + v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6 + v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6 + + uc = __riscv_ntl_load(&sc, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7 + sc = __riscv_ntl_load(&uc, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7 + us = __riscv_ntl_load(&ss, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7 + ss = __riscv_ntl_load(&us, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7 + ui = __riscv_ntl_load(&si, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7 + si = __riscv_ntl_load(&ui, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i32{{.*}}align 4, 
!nontemporal !4, !riscv-nontemporal-domain !7 + ull = __riscv_ntl_load(&sll, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7 + sll = __riscv_ntl_load(&ull, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7 + h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7 + f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7 + d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7 + v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7 + v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7 + v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7 + + uc = __riscv_ntl_load(&sc, __RISCV_NTLH_ALL); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8 + sc = __riscv_ntl_load(&uc, __RISCV_NTLH_ALL); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8 + us = __riscv_ntl_load(&ss, __RISCV_NTLH_ALL); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8 + ss = __riscv_ntl_load(&us, __RISCV_NTLH_ALL); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8 + ui = __riscv_ntl_load(&si, __RISCV_NTLH_ALL); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8 + si = __riscv_ntl_load(&ui, __RISCV_NTLH_ALL); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8 + ull = __riscv_ntl_load(&sll, __RISCV_NTLH_ALL); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8 + sll = __riscv_ntl_load(&ull, __RISCV_NTLH_ALL); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8 + h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_ALL); // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8 + f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_ALL); // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8 + d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_ALL); // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8 + v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_ALL); // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8 + v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_ALL); // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8 + v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_ALL); // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8 + + __riscv_ntl_store(&uc, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&sc, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&us, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&ss, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store 
i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&ui, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&si, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&ull, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&sll, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5 + __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5 + + __riscv_ntl_store(&uc, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&sc, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&us, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&ss, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&ui, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&si, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&ull, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&sll, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6 + __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6 + + __riscv_ntl_store(&uc, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: 
store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&sc, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&us, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&ss, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&ui, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&si, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&ull, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&sll, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7 + __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7 + + __riscv_ntl_store(&uc, 1, __RISCV_NTLH_ALL); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&sc, 1, __RISCV_NTLH_ALL); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&us, 1, __RISCV_NTLH_ALL); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&ss, 1, __RISCV_NTLH_ALL); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&ui, 1, __RISCV_NTLH_ALL); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&si, 1, __RISCV_NTLH_ALL); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&ull, 1, __RISCV_NTLH_ALL); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&sll, 1, __RISCV_NTLH_ALL); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_ALL); // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_ALL); // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_ALL); // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_ALL); // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8 + __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_ALL); // 
CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_ALL);   // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+
+}
+// clang-format on
+
+// CHECK: !4 = !{i32 1}
+// CHECK: !5 = !{i32 2}
+// CHECK: !6 = !{i32 3}
+// CHECK: !7 = !{i32 4}
+// CHECK: !8 = !{i32 5}
\ No newline at end of file
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -433,6 +433,13 @@
     return MachineMemOperand::MONone;
   }
 
+  /// This callback is used to inspect load/store SDNode.
+  /// The default implementation does nothing.
+  virtual MachineMemOperand::Flags
+  getTargetMMOFlags(const MemSDNode &Node) const {
+    return MachineMemOperand::MONone;
+  }
+
   MachineMemOperand::Flags
   getLoadMemOperandFlags(const LoadInst &LI, const DataLayout &DL,
                          AssumptionCache *AC = nullptr,
@@ -672,6 +679,13 @@
     return false;
   }
 
+  /// Return true if it is valid to merge the TargetMMOFlags in two SDNodes.
+  virtual bool
+  areTwoSDNodeTargetMMOFlagsMergeable(const MemSDNode &NodeX,
+                                      const MemSDNode &NodeY) const {
+    return true;
+  }
+
   /// Use bitwise logic to make pairs of compares more efficient. For example:
   /// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
   /// This should be true when it takes more than one instruction to lower
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19370,6 +19370,8 @@
       // Don't mix temporal stores with non-temporal stores.
       if (St->isNonTemporal() != Other->isNonTemporal())
         return false;
+      if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*St, *Other))
+        return false;
       SDValue OtherBC = peekThroughBitcasts(Other->getValue());
       // Allow merging constants of different types as integers.
       bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
@@ -19395,6 +19397,9 @@
         // Don't mix temporal loads with non-temporal loads.
        if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
          return false;
+        if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*cast<LoadSDNode>(Val),
+                                                     *OtherLd))
+          return false;
        if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
          return false;
        break;
@@ -20019,10 +20024,14 @@
     if (IsNonTemporalLoad)
       LdMMOFlags |= MachineMemOperand::MONonTemporal;
 
+    LdMMOFlags |= TLI.getTargetMMOFlags(*FirstLoad);
+
     MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
                                               ? MachineMemOperand::MONonTemporal
                                               : MachineMemOperand::MONone;
 
+    StMMOFlags |= TLI.getTargetMMOFlags(*StoreNodes[0].MemNode);
+
     SDValue NewLoad, NewStore;
     if (UseVectorTy || !DoIntegerTruncate) {
       NewLoad = DAG.getLoad(
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -480,6 +480,16 @@
   // This method returns the name of a target specific DAG node.
   const char *getTargetNodeName(unsigned Opcode) const override;
 
+  MachineMemOperand::Flags
+  getTargetMMOFlags(const Instruction &I) const override;
+
+  MachineMemOperand::Flags
+  getTargetMMOFlags(const MemSDNode &Node) const override;
+
+  bool
+  areTwoSDNodeTargetMMOFlagsMergeable(const MemSDNode &NodeX,
+                                      const MemSDNode &NodeY) const override;
+
   ConstraintType getConstraintType(StringRef Constraint) const override;
 
   unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15317,6 +15317,61 @@
   return Reg;
 }
 
+MachineMemOperand::Flags
+RISCVTargetLowering::getTargetMMOFlags(const Instruction &I) const {
+  const MDNode *NontemporalInfo = I.getMetadata(LLVMContext::MD_nontemporal);
+
+  if (NontemporalInfo == nullptr)
+    return MachineMemOperand::MONone;
+
+  // 1 -> the default value, treated as __RISCV_NTLH_ALL
+  // 2 -> __RISCV_NTLH_INNERMOST_PRIVATE
+  // 3 -> __RISCV_NTLH_ALL_PRIVATE
+  // 4 -> __RISCV_NTLH_INNERMOST_SHARED
+  // 5 -> __RISCV_NTLH_ALL
+  int NontemporalLevel = 1;
+  const MDNode *RISCVNontemporalInfo =
+      I.getMetadata("riscv-nontemporal-domain");
+  if (RISCVNontemporalInfo != nullptr)
+    NontemporalLevel =
+        cast<ConstantInt>(
+            cast<ConstantAsMetadata>(RISCVNontemporalInfo->getOperand(0))
+                ->getValue())
+            ->getZExtValue();
+
+  assert((1 <= NontemporalLevel && NontemporalLevel <= 5) &&
+         "RISC-V target doesn't support this non-temporal domain.");
+
+  // Map the default value to __RISCV_NTLH_ALL.
+  if (NontemporalLevel == 1)
+    NontemporalLevel = 5;
+
+  NontemporalLevel -= 2;
+  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
+  if (NontemporalLevel & 0b1)
+    Flags |= MONontemporalBit0;
+  if (NontemporalLevel & 0b10)
+    Flags |= MONontemporalBit1;
+
+  return Flags;
+}
+
+MachineMemOperand::Flags
+RISCVTargetLowering::getTargetMMOFlags(const MemSDNode &Node) const {
+
+  MachineMemOperand::Flags NodeFlags = Node.getMemOperand()->getFlags();
+  MachineMemOperand::Flags TargetFlags = MachineMemOperand::MONone;
+  TargetFlags |= (NodeFlags & MONontemporalBit0);
+  TargetFlags |= (NodeFlags & MONontemporalBit1);
+
+  return TargetFlags;
+}
+
+bool RISCVTargetLowering::areTwoSDNodeTargetMMOFlagsMergeable(
+    const MemSDNode &NodeX, const MemSDNode &NodeY) const {
+  return getTargetMMOFlags(NodeX) == getTargetMMOFlags(NodeY);
+}
+
 namespace llvm::RISCVVIntrinsicsTable {
 
 #define GET_RISCVVIntrinsicsTable_IMPL
diff --git a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
--- a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
@@ -67,11 +67,27 @@
         continue;
       MachineMemOperand *MMO = *(MBBI.memoperands_begin());
       if (MMO->isNonTemporal()) {
+        uint64_t NontemporalMode = 0;
+        if (MMO->getFlags() & MONontemporalBit0)
+          NontemporalMode += 0b1;
+        if (MMO->getFlags() & MONontemporalBit1)
+          NontemporalMode += 0b10;
+
+        static const uint16_t NTLOpc[] = {
+            RISCV::PseudoNTLP1, RISCV::PseudoNTLPALL, RISCV::PseudoNTLS1,
+            RISCV::PseudoNTLALL};
+        static const uint16_t CNTLOpc[] = {
+            RISCV::PseudoCNTLP1, RISCV::PseudoCNTLPALL, RISCV::PseudoCNTLS1,
+            RISCV::PseudoCNTLALL};
+
+        unsigned CurrNTLOpc;
         DebugLoc DL = MBBI.getDebugLoc();
         if (ST.hasStdExtCOrZca() && ST.enableRVCHintInstrs())
-          BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoCNTLALL));
+          CurrNTLOpc = CNTLOpc[NontemporalMode];
        else
-          BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoNTLALL));
+          CurrNTLOpc = NTLOpc[NontemporalMode];
+
+        BuildMI(MBB, MBBI, DL, TII->get(CurrNTLOpc));
         Changed = true;
       }
     }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -25,6 +25,11 @@
 
 class RISCVSubtarget;
 
+static const MachineMemOperand::Flags MONontemporalBit0 =
+    MachineMemOperand::MOTargetFlag1;
+static const MachineMemOperand::Flags MONontemporalBit1 =
+    MachineMemOperand::MOTargetFlag2;
+
 namespace RISCVCC {
 
 enum CondCode {
@@ -227,6 +232,9 @@
   std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
 
+  ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+  getSerializableMachineMemOperandTargetFlags() const override;
+
 protected:
   const RISCVSubtarget &STI;
 };
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2629,6 +2629,14 @@
   }
 }
 
+ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+RISCVInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
+  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
+      {{MONontemporalBit0, "riscv-non-temporal-domain-bit-0"},
+       {MONontemporalBit1, "riscv-non-temporal-domain-bit-1"}};
+  return makeArrayRef(TargetFlags);
+}
+
 // Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
 bool RISCV::isSEXT_W(const MachineInstr &MI) {
   return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
@@ -12,11 +12,23 @@
 //===----------------------------------------------------------------------===//
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 4 in {
-  def PseudoNTLALL : Pseudo<(outs), (ins), [], "ntl.all">,
+  def PseudoNTLP1   : Pseudo<(outs), (ins), [], "ntl.p1">,
+                      PseudoInstExpansion<(ADD X0, X0, X2)>;
+  def PseudoNTLPALL : Pseudo<(outs), (ins), [], "ntl.pall">,
+                      PseudoInstExpansion<(ADD X0, X0, X3)>;
+  def PseudoNTLS1   : Pseudo<(outs), (ins), [], "ntl.s1">,
+                      PseudoInstExpansion<(ADD X0, X0, X4)>;
+  def PseudoNTLALL  : Pseudo<(outs), (ins), [], "ntl.all">,
                       PseudoInstExpansion<(ADD X0, X0, X5)>;
 }
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 2 in {
-  def PseudoCNTLALL : Pseudo<(outs), (ins), [], "c.ntl.all">,
+  def PseudoCNTLP1   : Pseudo<(outs), (ins), [], "c.ntl.p1">,
+                       PseudoInstExpansion<(C_ADD_HINT X0, X0, X2)>;
+  def PseudoCNTLPALL : Pseudo<(outs), (ins), [], "c.ntl.pall">,
+                       PseudoInstExpansion<(C_ADD_HINT X0, X0, X3)>;
+  def PseudoCNTLS1   : Pseudo<(outs), (ins), [], "c.ntl.s1">,
+                       PseudoInstExpansion<(C_ADD_HINT X0, X0, X4)>;
+  def PseudoCNTLALL  : Pseudo<(outs), (ins), [], "c.ntl.all">,
                        PseudoInstExpansion<(C_ADD_HINT X0, X0, X5)>;
 }
diff --git a/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
--- a/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
@@ -130,4 +130,520 @@
   ret void
 }
 
+define <vscale x 2 x i64> @test_nontemporal_P1_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+  %1 =
load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret %1 +} + +define @test_nontemporal_P1_load_nxv4i32(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vl2re32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vl2re32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret %1 +} + +define @test_nontemporal_P1_load_nxv8i16(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vl2re16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vl2re16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret %1 +} + +define @test_nontemporal_P1_load_nxv16i8(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vl2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vl2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret %1 +} + +define void @test_nontemporal_P1_store_nxv2i64(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_nxv4i32(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_nxv8i16(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_nxv16i8(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define @test_nontemporal_PALL_load_nxv2i64(ptr %p) { +; 
CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vl2re64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vl2re64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret %1 +} + +define @test_nontemporal_PALL_load_nxv4i32(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vl2re32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vl2re32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret %1 +} + +define @test_nontemporal_PALL_load_nxv8i16(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vl2re16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vl2re16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret %1 +} + +define @test_nontemporal_PALL_load_nxv16i8(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vl2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vl2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret %1 +} + +define void @test_nontemporal_PALL_store_nxv2i64(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_nxv4i32(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_nxv8i16(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_nxv16i8(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv16i8: +; 
CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define @test_nontemporal_S1_load_nxv2i64(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vl2re64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vl2re64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret %1 +} + +define @test_nontemporal_S1_load_nxv4i32(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vl2re32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vl2re32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret %1 +} + +define @test_nontemporal_S1_load_nxv8i16(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vl2re16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vl2re16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret %1 +} + +define @test_nontemporal_S1_load_nxv16i8(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vl2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vl2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret %1 +} + +define void @test_nontemporal_S1_store_nxv2i64(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_nxv4i32(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_nxv8i16(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: 
test_nontemporal_S1_store_nxv8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_nxv16i8(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define @test_nontemporal_ALL_load_nxv2i64(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vl2re64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vl2re64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret %1 +} + +define @test_nontemporal_ALL_load_nxv4i32(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vl2re32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vl2re32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret %1 +} + +define @test_nontemporal_ALL_load_nxv8i16(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vl2re16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vl2re16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret %1 +} + +define @test_nontemporal_ALL_load_nxv16i8(ptr %p) { +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vl2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vl2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load , ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret %1 +} + +define void @test_nontemporal_ALL_store_nxv2i64(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_nxv4i32(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vs2r.v 
v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_nxv8i16(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_nxv16i8(ptr %p, %v) { +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vs2r.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vs2r.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + !0 = !{i32 1} +!1 = !{i32 2} +!2 = !{i32 3} +!3 = !{i32 4} +!4 = !{i32 5} diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll --- a/llvm/test/CodeGen/RISCV/nontemporal.ll +++ b/llvm/test/CodeGen/RISCV/nontemporal.ll @@ -1438,4 +1438,5665 @@ ret void } +define i64 @test_nontemporal_P1_load_i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: ld a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a2, 0(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a1, 4(a0) +; CHECK-RV32-NEXT: mv a0, a2 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a2, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a1, 4(a0) +; CHECK-RV32C-NEXT: mv a0, a2 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: ld a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: lw a2, 0(a0) +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: lw a1, 4(a0) +; CHECK-RV32V-NEXT: mv a0, a2 +; CHECK-RV32V-NEXT: ret + %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret i64 %1 +} + +define i32 @test_nontemporal_P1_load_i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: lw a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: lw a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: 
test_nontemporal_P1_load_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: lw a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: lw a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret i32 %1 +} + +define i16 @test_nontemporal_P1_load_i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: lh a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lh a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: lh a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lh a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: lh a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: lh a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret i16 %1 +} + +define i8 @test_nontemporal_P1_load_i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: lbu a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lbu a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: lbu a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lbu a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: lbu a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: lbu a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret i8 %1 +} + +define half @test_nontemporal_P1_load_half(ptr %p) nounwind { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: flh fa5, 0(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: flh fa4, 6(a0) +; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: flh fa5, 0(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: flh fa4, 6(a0) +; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: flh fa5, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: flh fa4, 6(a0) +; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64C-NEXT: ret +; +; 
CHECK-RV32C-LABEL: test_nontemporal_P1_load_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: flh fa5, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: flh fa4, 6(a0) +; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: flh fa5, 0(a0) +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: flh fa4, 6(a0) +; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: flh fa5, 0(a0) +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: flh fa4, 6(a0) +; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32V-NEXT: ret + %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + %2 = getelementptr half, ptr %p, i32 3 + %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !1 + %4 = fadd half %1, %3 + ret half %4 +} + +define float @test_nontemporal_P1_load_float(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: flw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: flw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: flw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: flw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: flw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: flw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret float %1 +} + +define double @test_nontemporal_P1_load_double(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: fld fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: fld fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: fld fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: fld fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_double: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: fld fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: fld fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret double %1 +} + +define <16 x i8> @test_nontemporal_P1_load_v16i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: 
ntl.p1 +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vle8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vle8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret <16 x i8> %1 +} + +define <8 x i16> @test_nontemporal_P1_load_v8i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw 
a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vle16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vle16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret <8 x i16> %1 +} + +define <4 x i32> @test_nontemporal_P1_load_v4i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vle32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vle32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret <4 x i32> %1 +} + +define <2 x i64> @test_nontemporal_P1_load_v2i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_P1_load_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: ld a2, 0(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: ld a1, 8(a0) +; CHECK-RV64-NEXT: mv a0, a2 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_load_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; 
CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a2, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: ld a1, 8(a0) +; CHECK-RV64C-NEXT: mv a0, a2 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vle64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vle64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret <2 x i64> %1 +} + +define void @test_nontemporal_P1_store_i64(ptr %p, i64 %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a2, 4(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: sd a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: sw a2, 4(a0) +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_i32(ptr %p, i32 %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i32: +; CHECK-RV64V: # %bb.0: +; 
CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: sw a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_i16(ptr %p, i16 %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: sh a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: sh a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_i8(ptr %p, i8 %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: sb a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: sb a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_half(ptr %p, half %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: fsh fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: fsh fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: fsh fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: fsh fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: fsh fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; 
CHECK-RV32V-LABEL: test_nontemporal_P1_store_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: fsh fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_float(ptr %p, float %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: fsw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: fsw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: fsw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: fsw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: fsw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: fsw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_double(ptr %p, double %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: fsd fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: fsd fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: fsd fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: fsd fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_double: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: fsd fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: fsd fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: .cfi_offset s0, -8 +; CHECK-RV64-NEXT: .cfi_offset s1, -16 +; CHECK-RV64-NEXT: lbu a2, 0(a1) +; CHECK-RV64-NEXT: lbu a3, 8(a1) +; CHECK-RV64-NEXT: lbu a4, 16(a1) +; CHECK-RV64-NEXT: lbu a5, 24(a1) +; CHECK-RV64-NEXT: lbu a6, 32(a1) +; CHECK-RV64-NEXT: lbu a7, 40(a1) +; CHECK-RV64-NEXT: lbu t0, 48(a1) +; CHECK-RV64-NEXT: lbu t1, 56(a1) +; CHECK-RV64-NEXT: lbu t2, 64(a1) +; CHECK-RV64-NEXT: lbu t3, 72(a1) +; CHECK-RV64-NEXT: lbu t4, 80(a1) +; CHECK-RV64-NEXT: lbu t5, 88(a1) +; CHECK-RV64-NEXT: lbu t6, 120(a1) +; CHECK-RV64-NEXT: lbu s0, 112(a1) +; CHECK-RV64-NEXT: lbu s1, 
104(a1) +; CHECK-RV64-NEXT: lbu a1, 96(a1) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb a1, 12(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: .cfi_offset s0, -4 +; CHECK-RV32-NEXT: .cfi_offset s1, -8 +; CHECK-RV32-NEXT: lbu a2, 0(a1) +; CHECK-RV32-NEXT: lbu a3, 4(a1) +; CHECK-RV32-NEXT: lbu a4, 8(a1) +; CHECK-RV32-NEXT: lbu a5, 12(a1) +; CHECK-RV32-NEXT: lbu a6, 16(a1) +; CHECK-RV32-NEXT: lbu a7, 20(a1) +; CHECK-RV32-NEXT: lbu t0, 24(a1) +; CHECK-RV32-NEXT: lbu t1, 28(a1) +; CHECK-RV32-NEXT: lbu t2, 32(a1) +; CHECK-RV32-NEXT: lbu t3, 36(a1) +; CHECK-RV32-NEXT: lbu t4, 40(a1) +; CHECK-RV32-NEXT: lbu t5, 44(a1) +; CHECK-RV32-NEXT: lbu t6, 60(a1) +; CHECK-RV32-NEXT: lbu s0, 56(a1) +; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a1, 48(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb a1, 12(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: addi sp, sp, -16 +; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: .cfi_offset s0, -8 +; 
CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu a6, 0(a1) +; CHECK-RV64C-NEXT: lbu a7, 8(a1) +; CHECK-RV64C-NEXT: lbu t0, 16(a1) +; CHECK-RV64C-NEXT: lbu t1, 24(a1) +; CHECK-RV64C-NEXT: lbu t2, 32(a1) +; CHECK-RV64C-NEXT: lbu t3, 40(a1) +; CHECK-RV64C-NEXT: lbu t4, 48(a1) +; CHECK-RV64C-NEXT: lbu t5, 56(a1) +; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu a3, 72(a1) +; CHECK-RV64C-NEXT: lbu a4, 80(a1) +; CHECK-RV64C-NEXT: lbu a5, 88(a1) +; CHECK-RV64C-NEXT: lbu a2, 120(a1) +; CHECK-RV64C-NEXT: lbu s0, 112(a1) +; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu a1, 96(a1) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb a1, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb a3, 9(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb t2, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb t1, 3(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb t0, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb a7, 1(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sb a6, 0(a0) +; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: addi sp, sp, 16 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: addi sp, sp, -16 +; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: .cfi_offset s0, -4 +; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu a6, 0(a1) +; CHECK-RV32C-NEXT: lbu a7, 4(a1) +; CHECK-RV32C-NEXT: lbu t0, 8(a1) +; CHECK-RV32C-NEXT: lbu t1, 12(a1) +; CHECK-RV32C-NEXT: lbu t2, 16(a1) +; CHECK-RV32C-NEXT: lbu t3, 20(a1) +; CHECK-RV32C-NEXT: lbu t4, 24(a1) +; CHECK-RV32C-NEXT: lbu t5, 28(a1) +; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu a3, 36(a1) +; CHECK-RV32C-NEXT: lbu a4, 40(a1) +; CHECK-RV32C-NEXT: lbu a5, 44(a1) +; CHECK-RV32C-NEXT: lbu a2, 60(a1) +; CHECK-RV32C-NEXT: lbu s0, 56(a1) +; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu a1, 48(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb a1, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb a3, 9(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb t2, 4(a0) +; CHECK-RV32C-NEXT: 
c.ntl.p1 +; CHECK-RV32C-NEXT: sb t1, 3(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb t0, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb a7, 1(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sb a6, 0(a0) +; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: addi sp, sp, 16 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vse8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vse8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_v8i16(ptr %p, <8 x i16> %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lh a2, 0(a1) +; CHECK-RV64-NEXT: lh a3, 8(a1) +; CHECK-RV64-NEXT: lh a4, 16(a1) +; CHECK-RV64-NEXT: lh a5, 24(a1) +; CHECK-RV64-NEXT: lh a6, 56(a1) +; CHECK-RV64-NEXT: lh a7, 48(a1) +; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a1, 32(a1) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh a1, 8(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lh a2, 0(a1) +; CHECK-RV32-NEXT: lh a3, 4(a1) +; CHECK-RV32-NEXT: lh a4, 8(a1) +; CHECK-RV32-NEXT: lh a5, 12(a1) +; CHECK-RV32-NEXT: lh a6, 28(a1) +; CHECK-RV32-NEXT: lh a7, 24(a1) +; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a1, 16(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh a1, 8(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a6, 0(a1) +; CHECK-RV64C-NEXT: lh a7, 8(a1) +; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh a5, 24(a1) +; CHECK-RV64C-NEXT: lh a2, 56(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 40(a1) +; CHECK-RV64C-NEXT: lh a1, 32(a1) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh a3, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh a1, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh a5, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh a7, 2(a0) 
+; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sh a6, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a6, 0(a1) +; CHECK-RV32C-NEXT: lh a7, 4(a1) +; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh a5, 12(a1) +; CHECK-RV32C-NEXT: lh a2, 28(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 20(a1) +; CHECK-RV32C-NEXT: lh a1, 16(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh a3, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh a1, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh a5, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sh a6, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vse16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vse16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_v4i32(ptr %p, <4 x i32> %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 24(a1) +; CHECK-RV64-NEXT: lw a3, 16(a1) +; CHECK-RV64-NEXT: lw a4, 8(a1) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sw a2, 12(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sw a3, 8(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sw a4, 4(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lw a2, 24(a1) +; CHECK-RV64C-NEXT: lw a3, 16(a1) +; CHECK-RV64C-NEXT: lw a4, 8(a1) +; CHECK-RV64C-NEXT: lw a1, 0(a1) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sw a2, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sw a3, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sw a4, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; 
CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vse32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vse32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define void @test_nontemporal_P1_store_v2i64(ptr %p, <2 x i64> %v) { +; CHECK-RV64-LABEL: test_nontemporal_P1_store_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: ntl.p1 +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_P1_store_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.p1 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.p1 +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.p1 +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.p1 +; CHECK-RV64V-NEXT: vse64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.p1 +; CHECK-RV32V-NEXT: vse64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1 + ret void +} + +define i64 @test_nontemporal_PALL_load_i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a2, 0(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a1, 4(a0) +; CHECK-RV32-NEXT: mv a0, a2 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a2, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a1, 4(a0) +; CHECK-RV32C-NEXT: mv a0, a2 +; CHECK-RV32C-NEXT: ret +; +; 
CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: ld a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: lw a2, 0(a0) +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: lw a1, 4(a0) +; CHECK-RV32V-NEXT: mv a0, a2 +; CHECK-RV32V-NEXT: ret + %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret i64 %1 +} + +define i32 @test_nontemporal_PALL_load_i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: lw a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: lw a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: lw a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: lw a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret i32 %1 +} + +define i16 @test_nontemporal_PALL_load_i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: lh a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lh a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: lh a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lh a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: lh a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: lh a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret i16 %1 +} + +define i8 @test_nontemporal_PALL_load_i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: lbu a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lbu a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: lbu a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lbu a0, 0(a0) +; 
CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: lbu a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: lbu a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret i8 %1 +} + +define half @test_nontemporal_PALL_load_half(ptr %p) nounwind { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: flh fa5, 0(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: flh fa4, 6(a0) +; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: flh fa5, 0(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: flh fa4, 6(a0) +; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: flh fa5, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: flh fa4, 6(a0) +; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: flh fa5, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: flh fa4, 6(a0) +; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: flh fa5, 0(a0) +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: flh fa4, 6(a0) +; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: flh fa5, 0(a0) +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: flh fa4, 6(a0) +; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32V-NEXT: ret + %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + %2 = getelementptr half, ptr %p, i32 3 + %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !2 + %4 = fadd half %1, %3 + ret half %4 +} + +define float @test_nontemporal_PALL_load_float(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: flw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: flw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: flw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: flw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: flw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: flw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load float, ptr %p, 
!nontemporal !0, !riscv-nontemporal-domain !2 + ret float %1 +} + +define double @test_nontemporal_PALL_load_double(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: fld fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: fld fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: fld fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: fld fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_double: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: fld fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: fld fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret double %1 +} + +define <16 x i8> @test_nontemporal_PALL_load_v16i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vle8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vle8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret <16 x i8> %1 +} + +define <8 x i16> @test_nontemporal_PALL_load_v8i16(ptr %p) 
{ +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vle16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vle16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret <8 x i16> %1 +} + +define <4 x i32> @test_nontemporal_PALL_load_v4i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; 
CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vle32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vle32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret <4 x i32> %1 +} + +define <2 x i64> @test_nontemporal_PALL_load_v2i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a2, 0(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: ld a1, 8(a0) +; CHECK-RV64-NEXT: mv a0, a2 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a2, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: ld a1, 8(a0) +; CHECK-RV64C-NEXT: mv a0, a2 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vle64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vle64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret <2 x i64> %1 +} + +define void @test_nontemporal_PALL_store_i64(ptr %p, i64 %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a2, 4(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i64: +; CHECK-RV64C: # %bb.0: +; 
CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: sd a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: sw a2, 4(a0) +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_i32(ptr %p, i32 %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: sw a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_i16(ptr %p, i16 %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: sh a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: sh a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_i8(ptr %p, i8 %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb a1, 0(a0) +; CHECK-RV32-NEXT: 
ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: sb a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: sb a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_half(ptr %p, half %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: fsh fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: fsh fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: fsh fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: fsh fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: fsh fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: fsh fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_float(ptr %p, float %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: fsw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: fsw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: fsw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: fsw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: fsw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: fsw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_double(ptr %p, double %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: fsd fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: fsd fa0, 
0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: fsd fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: fsd fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_double: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: fsd fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: fsd fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: .cfi_offset s0, -8 +; CHECK-RV64-NEXT: .cfi_offset s1, -16 +; CHECK-RV64-NEXT: lbu a2, 0(a1) +; CHECK-RV64-NEXT: lbu a3, 8(a1) +; CHECK-RV64-NEXT: lbu a4, 16(a1) +; CHECK-RV64-NEXT: lbu a5, 24(a1) +; CHECK-RV64-NEXT: lbu a6, 32(a1) +; CHECK-RV64-NEXT: lbu a7, 40(a1) +; CHECK-RV64-NEXT: lbu t0, 48(a1) +; CHECK-RV64-NEXT: lbu t1, 56(a1) +; CHECK-RV64-NEXT: lbu t2, 64(a1) +; CHECK-RV64-NEXT: lbu t3, 72(a1) +; CHECK-RV64-NEXT: lbu t4, 80(a1) +; CHECK-RV64-NEXT: lbu t5, 88(a1) +; CHECK-RV64-NEXT: lbu t6, 120(a1) +; CHECK-RV64-NEXT: lbu s0, 112(a1) +; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a1, 96(a1) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb a1, 12(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: .cfi_offset s0, -4 +; CHECK-RV32-NEXT: .cfi_offset s1, -8 +; CHECK-RV32-NEXT: lbu a2, 0(a1) +; CHECK-RV32-NEXT: lbu a3, 4(a1) +; CHECK-RV32-NEXT: lbu a4, 8(a1) +; CHECK-RV32-NEXT: lbu a5, 12(a1) +; CHECK-RV32-NEXT: lbu a6, 16(a1) +; CHECK-RV32-NEXT: lbu a7, 20(a1) +; 
CHECK-RV32-NEXT: lbu t0, 24(a1) +; CHECK-RV32-NEXT: lbu t1, 28(a1) +; CHECK-RV32-NEXT: lbu t2, 32(a1) +; CHECK-RV32-NEXT: lbu t3, 36(a1) +; CHECK-RV32-NEXT: lbu t4, 40(a1) +; CHECK-RV32-NEXT: lbu t5, 44(a1) +; CHECK-RV32-NEXT: lbu t6, 60(a1) +; CHECK-RV32-NEXT: lbu s0, 56(a1) +; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a1, 48(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb a1, 12(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: addi sp, sp, -16 +; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: .cfi_offset s0, -8 +; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu a6, 0(a1) +; CHECK-RV64C-NEXT: lbu a7, 8(a1) +; CHECK-RV64C-NEXT: lbu t0, 16(a1) +; CHECK-RV64C-NEXT: lbu t1, 24(a1) +; CHECK-RV64C-NEXT: lbu t2, 32(a1) +; CHECK-RV64C-NEXT: lbu t3, 40(a1) +; CHECK-RV64C-NEXT: lbu t4, 48(a1) +; CHECK-RV64C-NEXT: lbu t5, 56(a1) +; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu a3, 72(a1) +; CHECK-RV64C-NEXT: lbu a4, 80(a1) +; CHECK-RV64C-NEXT: lbu a5, 88(a1) +; CHECK-RV64C-NEXT: lbu a2, 120(a1) +; CHECK-RV64C-NEXT: lbu s0, 112(a1) +; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu a1, 96(a1) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb a1, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb a3, 9(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb t2, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb t1, 3(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb t0, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb a7, 1(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sb a6, 0(a0) +; CHECK-RV64C-NEXT: ld s0, 8(sp) # 
8-byte Folded Reload +; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: addi sp, sp, 16 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: addi sp, sp, -16 +; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: .cfi_offset s0, -4 +; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu a6, 0(a1) +; CHECK-RV32C-NEXT: lbu a7, 4(a1) +; CHECK-RV32C-NEXT: lbu t0, 8(a1) +; CHECK-RV32C-NEXT: lbu t1, 12(a1) +; CHECK-RV32C-NEXT: lbu t2, 16(a1) +; CHECK-RV32C-NEXT: lbu t3, 20(a1) +; CHECK-RV32C-NEXT: lbu t4, 24(a1) +; CHECK-RV32C-NEXT: lbu t5, 28(a1) +; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu a3, 36(a1) +; CHECK-RV32C-NEXT: lbu a4, 40(a1) +; CHECK-RV32C-NEXT: lbu a5, 44(a1) +; CHECK-RV32C-NEXT: lbu a2, 60(a1) +; CHECK-RV32C-NEXT: lbu s0, 56(a1) +; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu a1, 48(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb a1, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb a3, 9(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb t2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb t1, 3(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb t0, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb a7, 1(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sb a6, 0(a0) +; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: addi sp, sp, 16 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vse8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vse8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_v8i16(ptr %p, <8 x i16> %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lh a2, 0(a1) +; CHECK-RV64-NEXT: lh a3, 8(a1) +; CHECK-RV64-NEXT: lh a4, 16(a1) +; CHECK-RV64-NEXT: lh a5, 24(a1) +; CHECK-RV64-NEXT: lh a6, 56(a1) +; CHECK-RV64-NEXT: lh a7, 48(a1) +; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a1, 32(a1) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh a1, 8(a0) +; 
CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lh a2, 0(a1) +; CHECK-RV32-NEXT: lh a3, 4(a1) +; CHECK-RV32-NEXT: lh a4, 8(a1) +; CHECK-RV32-NEXT: lh a5, 12(a1) +; CHECK-RV32-NEXT: lh a6, 28(a1) +; CHECK-RV32-NEXT: lh a7, 24(a1) +; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a1, 16(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh a1, 8(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a6, 0(a1) +; CHECK-RV64C-NEXT: lh a7, 8(a1) +; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh a5, 24(a1) +; CHECK-RV64C-NEXT: lh a2, 56(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 40(a1) +; CHECK-RV64C-NEXT: lh a1, 32(a1) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh a3, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh a1, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh a5, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sh a6, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a6, 0(a1) +; CHECK-RV32C-NEXT: lh a7, 4(a1) +; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh a5, 12(a1) +; CHECK-RV32C-NEXT: lh a2, 28(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 20(a1) +; CHECK-RV32C-NEXT: lh a1, 16(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh a3, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh a1, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh a5, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sh a6, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vse16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vse16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void 
@test_nontemporal_PALL_store_v4i32(ptr %p, <4 x i32> %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 24(a1) +; CHECK-RV64-NEXT: lw a3, 16(a1) +; CHECK-RV64-NEXT: lw a4, 8(a1) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sw a2, 12(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sw a3, 8(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sw a4, 4(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lw a2, 24(a1) +; CHECK-RV64C-NEXT: lw a3, 16(a1) +; CHECK-RV64C-NEXT: lw a4, 8(a1) +; CHECK-RV64C-NEXT: lw a1, 0(a1) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sw a2, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sw a3, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sw a4, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vse32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vse32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define void @test_nontemporal_PALL_store_v2i64(ptr %p, <2 x i64> %v) { +; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: ntl.pall +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.pall +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sd a2, 
8(a0) +; CHECK-RV64C-NEXT: c.ntl.pall +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.pall +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.pall +; CHECK-RV64V-NEXT: vse64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.pall +; CHECK-RV32V-NEXT: vse64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2 + ret void +} + +define i64 @test_nontemporal_S1_load_i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a2, 0(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a1, 4(a0) +; CHECK-RV32-NEXT: mv a0, a2 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: ld a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a2, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a1, 4(a0) +; CHECK-RV32C-NEXT: mv a0, a2 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: ld a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: lw a2, 0(a0) +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: lw a1, 4(a0) +; CHECK-RV32V-NEXT: mv a0, a2 +; CHECK-RV32V-NEXT: ret + %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret i64 %1 +} + +define i32 @test_nontemporal_S1_load_i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: lw a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: lw a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: lw a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i32: +; CHECK-RV32V: # %bb.0: +; 
CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: lw a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret i32 %1 +} + +define i16 @test_nontemporal_S1_load_i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: lh a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lh a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: lh a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lh a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: lh a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: lh a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret i16 %1 +} + +define i8 @test_nontemporal_S1_load_i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: lbu a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lbu a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: lbu a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lbu a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: lbu a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: lbu a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret i8 %1 +} + +define half @test_nontemporal_S1_load_half(ptr %p) nounwind { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: flh fa5, 0(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: flh fa4, 6(a0) +; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: flh fa5, 0(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: flh fa4, 6(a0) +; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: flh fa5, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: flh fa4, 6(a0) +; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: flh fa5, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: flh fa4, 6(a0) +; CHECK-RV32C-NEXT: fadd.h 
fa0, fa5, fa4 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: flh fa5, 0(a0) +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: flh fa4, 6(a0) +; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: flh fa5, 0(a0) +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: flh fa4, 6(a0) +; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32V-NEXT: ret + %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + %2 = getelementptr half, ptr %p, i32 3 + %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !3 + %4 = fadd half %1, %3 + ret half %4 +} + +define float @test_nontemporal_S1_load_float(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: flw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: flw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: flw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: flw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: flw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: flw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret float %1 +} + +define double @test_nontemporal_S1_load_double(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: fld fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: fld fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: fld fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: fld fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_double: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: fld fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: fld fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret double %1 +} + +define <16 x i8> @test_nontemporal_S1_load_v16i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: 
test_nontemporal_S1_load_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vle8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vle8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret <16 x i8> %1 +} + +define <8 x i16> @test_nontemporal_S1_load_v8i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli 
zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vle16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vle16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret <8 x i16> %1 +} + +define <4 x i32> @test_nontemporal_S1_load_v4i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vle32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vle32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret <4 x i32> %1 +} + +define <2 x i64> @test_nontemporal_S1_load_v2i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_S1_load_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a2, 0(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: ld a1, 8(a0) +; CHECK-RV64-NEXT: mv a0, a2 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_load_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: 
ld a2, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: ld a1, 8(a0) +; CHECK-RV64C-NEXT: mv a0, a2 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vle64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vle64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret <2 x i64> %1 +} + +define void @test_nontemporal_S1_store_i64(ptr %p, i64 %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a2, 4(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: sd a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: sw a2, 4(a0) +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_i32(ptr %p, i32 %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: sw a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: sw a1, 
0(a0) +; CHECK-RV32V-NEXT: ret + store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_i16(ptr %p, i16 %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: sh a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: sh a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_i8(ptr %p, i8 %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: sb a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: sb a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_half(ptr %p, half %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: fsh fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: fsh fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: fsh fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: fsh fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: fsh fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: fsh fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 
+ ret void +} + +define void @test_nontemporal_S1_store_float(ptr %p, float %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: fsw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: fsw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: fsw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: fsw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: fsw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: fsw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_double(ptr %p, double %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: fsd fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: fsd fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: fsd fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: fsd fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_double: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: fsd fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: fsd fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: .cfi_offset s0, -8 +; CHECK-RV64-NEXT: .cfi_offset s1, -16 +; CHECK-RV64-NEXT: lbu a2, 0(a1) +; CHECK-RV64-NEXT: lbu a3, 8(a1) +; CHECK-RV64-NEXT: lbu a4, 16(a1) +; CHECK-RV64-NEXT: lbu a5, 24(a1) +; CHECK-RV64-NEXT: lbu a6, 32(a1) +; CHECK-RV64-NEXT: lbu a7, 40(a1) +; CHECK-RV64-NEXT: lbu t0, 48(a1) +; CHECK-RV64-NEXT: lbu t1, 56(a1) +; CHECK-RV64-NEXT: lbu t2, 64(a1) +; CHECK-RV64-NEXT: lbu t3, 72(a1) +; CHECK-RV64-NEXT: lbu t4, 80(a1) +; CHECK-RV64-NEXT: lbu t5, 88(a1) +; CHECK-RV64-NEXT: lbu t6, 120(a1) +; CHECK-RV64-NEXT: lbu s0, 112(a1) +; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a1, 96(a1) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb s1, 13(a0) +; 
CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb a1, 12(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: .cfi_offset s0, -4 +; CHECK-RV32-NEXT: .cfi_offset s1, -8 +; CHECK-RV32-NEXT: lbu a2, 0(a1) +; CHECK-RV32-NEXT: lbu a3, 4(a1) +; CHECK-RV32-NEXT: lbu a4, 8(a1) +; CHECK-RV32-NEXT: lbu a5, 12(a1) +; CHECK-RV32-NEXT: lbu a6, 16(a1) +; CHECK-RV32-NEXT: lbu a7, 20(a1) +; CHECK-RV32-NEXT: lbu t0, 24(a1) +; CHECK-RV32-NEXT: lbu t1, 28(a1) +; CHECK-RV32-NEXT: lbu t2, 32(a1) +; CHECK-RV32-NEXT: lbu t3, 36(a1) +; CHECK-RV32-NEXT: lbu t4, 40(a1) +; CHECK-RV32-NEXT: lbu t5, 44(a1) +; CHECK-RV32-NEXT: lbu t6, 60(a1) +; CHECK-RV32-NEXT: lbu s0, 56(a1) +; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a1, 48(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb a1, 12(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: addi sp, sp, -16 +; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: .cfi_offset s0, -8 +; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu a6, 0(a1) +; CHECK-RV64C-NEXT: lbu a7, 8(a1) +; CHECK-RV64C-NEXT: lbu t0, 16(a1) +; CHECK-RV64C-NEXT: lbu t1, 24(a1) +; CHECK-RV64C-NEXT: lbu t2, 32(a1) +; CHECK-RV64C-NEXT: 
lbu t3, 40(a1) +; CHECK-RV64C-NEXT: lbu t4, 48(a1) +; CHECK-RV64C-NEXT: lbu t5, 56(a1) +; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu a3, 72(a1) +; CHECK-RV64C-NEXT: lbu a4, 80(a1) +; CHECK-RV64C-NEXT: lbu a5, 88(a1) +; CHECK-RV64C-NEXT: lbu a2, 120(a1) +; CHECK-RV64C-NEXT: lbu s0, 112(a1) +; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu a1, 96(a1) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb a1, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb a3, 9(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb t2, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb t1, 3(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb t0, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb a7, 1(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sb a6, 0(a0) +; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: addi sp, sp, 16 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: addi sp, sp, -16 +; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: .cfi_offset s0, -4 +; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu a6, 0(a1) +; CHECK-RV32C-NEXT: lbu a7, 4(a1) +; CHECK-RV32C-NEXT: lbu t0, 8(a1) +; CHECK-RV32C-NEXT: lbu t1, 12(a1) +; CHECK-RV32C-NEXT: lbu t2, 16(a1) +; CHECK-RV32C-NEXT: lbu t3, 20(a1) +; CHECK-RV32C-NEXT: lbu t4, 24(a1) +; CHECK-RV32C-NEXT: lbu t5, 28(a1) +; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu a3, 36(a1) +; CHECK-RV32C-NEXT: lbu a4, 40(a1) +; CHECK-RV32C-NEXT: lbu a5, 44(a1) +; CHECK-RV32C-NEXT: lbu a2, 60(a1) +; CHECK-RV32C-NEXT: lbu s0, 56(a1) +; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu a1, 48(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb a1, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb a3, 9(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb t2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb t1, 3(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb t0, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb a7, 1(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sb a6, 0(a0) +; 
CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: addi sp, sp, 16 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vse8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vse8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_v8i16(ptr %p, <8 x i16> %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lh a2, 0(a1) +; CHECK-RV64-NEXT: lh a3, 8(a1) +; CHECK-RV64-NEXT: lh a4, 16(a1) +; CHECK-RV64-NEXT: lh a5, 24(a1) +; CHECK-RV64-NEXT: lh a6, 56(a1) +; CHECK-RV64-NEXT: lh a7, 48(a1) +; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a1, 32(a1) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh a1, 8(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lh a2, 0(a1) +; CHECK-RV32-NEXT: lh a3, 4(a1) +; CHECK-RV32-NEXT: lh a4, 8(a1) +; CHECK-RV32-NEXT: lh a5, 12(a1) +; CHECK-RV32-NEXT: lh a6, 28(a1) +; CHECK-RV32-NEXT: lh a7, 24(a1) +; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a1, 16(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh a1, 8(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a6, 0(a1) +; CHECK-RV64C-NEXT: lh a7, 8(a1) +; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh a5, 24(a1) +; CHECK-RV64C-NEXT: lh a2, 56(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 40(a1) +; CHECK-RV64C-NEXT: lh a1, 32(a1) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh a3, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh a1, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh a5, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sh a6, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a6, 0(a1) +; CHECK-RV32C-NEXT: lh a7, 
4(a1) +; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh a5, 12(a1) +; CHECK-RV32C-NEXT: lh a2, 28(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 20(a1) +; CHECK-RV32C-NEXT: lh a1, 16(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh a3, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh a1, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh a5, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sh a6, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vse16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vse16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_v4i32(ptr %p, <4 x i32> %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 24(a1) +; CHECK-RV64-NEXT: lw a3, 16(a1) +; CHECK-RV64-NEXT: lw a4, 8(a1) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sw a2, 12(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sw a3, 8(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sw a4, 4(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lw a2, 24(a1) +; CHECK-RV64C-NEXT: lw a3, 16(a1) +; CHECK-RV64C-NEXT: lw a4, 8(a1) +; CHECK-RV64C-NEXT: lw a1, 0(a1) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sw a2, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sw a3, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sw a4, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vse32.v v8, (a0) +; 
CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vse32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define void @test_nontemporal_S1_store_v2i64(ptr %p, <2 x i64> %v) { +; CHECK-RV64-LABEL: test_nontemporal_S1_store_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: ntl.s1 +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_S1_store_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.s1 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.s1 +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.s1 +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.s1 +; CHECK-RV64V-NEXT: vse64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.s1 +; CHECK-RV32V-NEXT: vse64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3 + ret void +} + +define i64 @test_nontemporal_ALL_load_i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 0(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 4(a0) +; CHECK-RV32-NEXT: mv a0, a2 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 4(a0) +; CHECK-RV32C-NEXT: mv a0, a2 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: ld a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i64: +; CHECK-RV32V: # 
%bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lw a2, 0(a0) +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lw a1, 4(a0) +; CHECK-RV32V-NEXT: mv a0, a2 +; CHECK-RV32V-NEXT: ret + %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret i64 %1 +} + +define i32 @test_nontemporal_ALL_load_i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: lw a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: lw a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: lw a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lw a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret i32 %1 +} + +define i16 @test_nontemporal_ALL_load_i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: lh a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lh a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: lh a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lh a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: lh a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lh a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret i16 %1 +} + +define i8 @test_nontemporal_ALL_load_i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: lbu a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lbu a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: lbu a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lbu a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: lbu a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i8: +; CHECK-RV32V: # %bb.0: +; 
CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lbu a0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret i8 %1 +} + +define half @test_nontemporal_ALL_load_half(ptr %p) nounwind { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: flh fa5, 0(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: flh fa4, 6(a0) +; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: flh fa5, 0(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: flh fa4, 6(a0) +; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: flh fa5, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: flh fa4, 6(a0) +; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: flh fa5, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: flh fa4, 6(a0) +; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: flh fa5, 0(a0) +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: flh fa4, 6(a0) +; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: flh fa5, 0(a0) +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: flh fa4, 6(a0) +; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32V-NEXT: ret + %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + %2 = getelementptr half, ptr %p, i32 3 + %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !4 + %4 = fadd half %1, %3 + ret half %4 +} + +define float @test_nontemporal_ALL_load_float(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: flw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: flw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: flw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: flw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: flw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: flw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret float %1 +} + +define double @test_nontemporal_ALL_load_double(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: fld fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; 
CHECK-RV32-LABEL: test_nontemporal_ALL_load_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: fld fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: fld fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: fld fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_double: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: fld fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: fld fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret double %1 +} + +define <16 x i8> @test_nontemporal_ALL_load_v16i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vle8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vle8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret <16 x i8> %1 +} + +define <8 x i16> @test_nontemporal_ALL_load_v8i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: 
test_nontemporal_ALL_load_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vle16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vle16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret <8 x i16> %1 +} + +define <4 x i32> @test_nontemporal_ALL_load_v4i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v4i32: +; CHECK-RV64V: # 
%bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vle32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vle32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret <4 x i32> %1 +} + +define <2 x i64> @test_nontemporal_ALL_load_v2i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a2, 0(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a1, 8(a0) +; CHECK-RV64-NEXT: mv a0, a2 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a2, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a1, 8(a0) +; CHECK-RV64C-NEXT: mv a0, a2 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vle64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vle64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret <2 x i64> %1 +} + +define void @test_nontemporal_ALL_store_i64(ptr %p, i64 %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a2, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i64: +; CHECK-RV64V: # 
%bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: sd a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sw a2, 4(a0) +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_i32(ptr %p, i32 %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: sw a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_i16(ptr %p, i16 %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: sh a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sh a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_i8(ptr %p, i8 %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i8: +; CHECK-RV64V: # %bb.0: +; 
CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: sb a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sb a1, 0(a0) +; CHECK-RV32V-NEXT: ret + store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_half(ptr %p, half %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: fsh fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: fsh fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: fsh fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: fsh fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: fsh fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: fsh fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_float(ptr %p, float %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: fsw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: fsw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: fsw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: fsw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: fsw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: fsw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_double(ptr %p, double %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: fsd fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: fsd fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: fsd fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: fsd fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_double: +; CHECK-RV64V: # %bb.0: 
+; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: fsd fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: fsd fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: .cfi_offset s0, -8 +; CHECK-RV64-NEXT: .cfi_offset s1, -16 +; CHECK-RV64-NEXT: lbu a2, 0(a1) +; CHECK-RV64-NEXT: lbu a3, 8(a1) +; CHECK-RV64-NEXT: lbu a4, 16(a1) +; CHECK-RV64-NEXT: lbu a5, 24(a1) +; CHECK-RV64-NEXT: lbu a6, 32(a1) +; CHECK-RV64-NEXT: lbu a7, 40(a1) +; CHECK-RV64-NEXT: lbu t0, 48(a1) +; CHECK-RV64-NEXT: lbu t1, 56(a1) +; CHECK-RV64-NEXT: lbu t2, 64(a1) +; CHECK-RV64-NEXT: lbu t3, 72(a1) +; CHECK-RV64-NEXT: lbu t4, 80(a1) +; CHECK-RV64-NEXT: lbu t5, 88(a1) +; CHECK-RV64-NEXT: lbu t6, 120(a1) +; CHECK-RV64-NEXT: lbu s0, 112(a1) +; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a1, 96(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a1, 12(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: .cfi_offset s0, -4 +; CHECK-RV32-NEXT: .cfi_offset s1, -8 +; CHECK-RV32-NEXT: lbu a2, 0(a1) +; CHECK-RV32-NEXT: lbu a3, 4(a1) +; CHECK-RV32-NEXT: lbu a4, 8(a1) +; CHECK-RV32-NEXT: lbu a5, 12(a1) +; CHECK-RV32-NEXT: lbu a6, 16(a1) +; CHECK-RV32-NEXT: lbu a7, 20(a1) +; CHECK-RV32-NEXT: lbu t0, 24(a1) +; CHECK-RV32-NEXT: lbu t1, 28(a1) +; CHECK-RV32-NEXT: lbu t2, 32(a1) +; CHECK-RV32-NEXT: lbu t3, 36(a1) +; CHECK-RV32-NEXT: lbu t4, 40(a1) +; CHECK-RV32-NEXT: lbu t5, 44(a1) +; CHECK-RV32-NEXT: lbu t6, 60(a1) +; CHECK-RV32-NEXT: lbu s0, 56(a1) +; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a1, 48(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: ntl.all +; 
CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a1, 12(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: addi sp, sp, -16 +; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: .cfi_offset s0, -8 +; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu a6, 0(a1) +; CHECK-RV64C-NEXT: lbu a7, 8(a1) +; CHECK-RV64C-NEXT: lbu t0, 16(a1) +; CHECK-RV64C-NEXT: lbu t1, 24(a1) +; CHECK-RV64C-NEXT: lbu t2, 32(a1) +; CHECK-RV64C-NEXT: lbu t3, 40(a1) +; CHECK-RV64C-NEXT: lbu t4, 48(a1) +; CHECK-RV64C-NEXT: lbu t5, 56(a1) +; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu a3, 72(a1) +; CHECK-RV64C-NEXT: lbu a4, 80(a1) +; CHECK-RV64C-NEXT: lbu a5, 88(a1) +; CHECK-RV64C-NEXT: lbu a2, 120(a1) +; CHECK-RV64C-NEXT: lbu s0, 112(a1) +; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu a1, 96(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a1, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a3, 9(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t2, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t1, 3(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t0, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a7, 1(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a6, 0(a0) +; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: addi sp, sp, 16 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: addi sp, sp, -16 +; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: .cfi_offset s0, -4 +; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu a6, 0(a1) +; 
CHECK-RV32C-NEXT: lbu a7, 4(a1) +; CHECK-RV32C-NEXT: lbu t0, 8(a1) +; CHECK-RV32C-NEXT: lbu t1, 12(a1) +; CHECK-RV32C-NEXT: lbu t2, 16(a1) +; CHECK-RV32C-NEXT: lbu t3, 20(a1) +; CHECK-RV32C-NEXT: lbu t4, 24(a1) +; CHECK-RV32C-NEXT: lbu t5, 28(a1) +; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu a3, 36(a1) +; CHECK-RV32C-NEXT: lbu a4, 40(a1) +; CHECK-RV32C-NEXT: lbu a5, 44(a1) +; CHECK-RV32C-NEXT: lbu a2, 60(a1) +; CHECK-RV32C-NEXT: lbu s0, 56(a1) +; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu a1, 48(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a1, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a3, 9(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t1, 3(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t0, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a7, 1(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a6, 0(a0) +; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: addi sp, sp, 16 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vse8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vse8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_v8i16(ptr %p, <8 x i16> %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lh a2, 0(a1) +; CHECK-RV64-NEXT: lh a3, 8(a1) +; CHECK-RV64-NEXT: lh a4, 16(a1) +; CHECK-RV64-NEXT: lh a5, 24(a1) +; CHECK-RV64-NEXT: lh a6, 56(a1) +; CHECK-RV64-NEXT: lh a7, 48(a1) +; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a1, 32(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a1, 8(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lh a2, 0(a1) +; CHECK-RV32-NEXT: lh a3, 4(a1) +; CHECK-RV32-NEXT: lh a4, 8(a1) +; CHECK-RV32-NEXT: lh a5, 12(a1) +; CHECK-RV32-NEXT: lh a6, 28(a1) +; CHECK-RV32-NEXT: lh a7, 24(a1) +; CHECK-RV32-NEXT: lh t0, 
20(a1) +; CHECK-RV32-NEXT: lh a1, 16(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a1, 8(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a6, 0(a1) +; CHECK-RV64C-NEXT: lh a7, 8(a1) +; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh a5, 24(a1) +; CHECK-RV64C-NEXT: lh a2, 56(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 40(a1) +; CHECK-RV64C-NEXT: lh a1, 32(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a3, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a1, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a5, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a6, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a6, 0(a1) +; CHECK-RV32C-NEXT: lh a7, 4(a1) +; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh a5, 12(a1) +; CHECK-RV32C-NEXT: lh a2, 28(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 20(a1) +; CHECK-RV32C-NEXT: lh a1, 16(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a3, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a1, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a5, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a6, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vse16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vse16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_v4i32(ptr %p, <4 x i32> %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 24(a1) +; CHECK-RV64-NEXT: lw a3, 16(a1) +; CHECK-RV64-NEXT: lw a4, 8(a1) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a2, 12(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a3, 8(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a4, 4(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v4i32: +; CHECK-RV32: 
# %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lw a2, 24(a1) +; CHECK-RV64C-NEXT: lw a3, 16(a1) +; CHECK-RV64C-NEXT: lw a4, 8(a1) +; CHECK-RV64C-NEXT: lw a1, 0(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a2, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a3, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a4, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vse32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vse32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + +define void @test_nontemporal_ALL_store_v2i64(ptr %p, <2 x i64> %v) { +; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: 
test_nontemporal_ALL_store_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vse64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vse64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4 + ret void +} + + !0 = !{i32 1} +!1 = !{i32 2} +!2 = !{i32 3} +!3 = !{i32 4} +!4 = !{i32 5}
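Usage sketch (illustrative only, not part of the patch): the metadata nodes above encode the Zihintntl domains, with !0 marking an access nontemporal and !{i32 4}/!{i32 5} selecting the S1 (innermost shared) and ALL domains that the checks lower to ntl.s1/c.ntl.s1 and ntl.all/c.ntl.all. Assuming a clang built with this change and invoked with -menable-experimental-extensions and an -march string that enables zihintntl, C code along the following lines is expected to emit IR carrying this metadata; the function and variable names here are hypothetical.

  /* Hypothetical example, shown only to connect the intrinsics to the
     metadata exercised by the tests above. */
  #include <riscv_ntlh.h>

  void stream_copy(int *dst, const int *src, int n) {
    for (int i = 0; i < n; ++i) {
      /* Domain 5 (__RISCV_NTLH_ALL) corresponds to the
         !riscv-nontemporal-domain !{i32 5} cases tested above, i.e. the
         accesses are hinted nontemporal for all cache levels. */
      int v = __riscv_ntl_load(&src[i], __RISCV_NTLH_ALL);
      __riscv_ntl_store(&dst[i], v, __RISCV_NTLH_ALL);
    }
  }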