diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -971,10 +971,6 @@
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
                             [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>]>;
-  class AdvSIMD_ManyVec_PredLoad_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMPointerToElt<0>],
-                            [IntrReadMem, IntrArgMemOnly]>;
-
   class AdvSIMD_1Vec_PredLoad_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1569,10 +1565,6 @@
 def int_aarch64_sve_ld1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
-def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
-def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
-def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
-
 def int_aarch64_sve_ld2_sret : AdvSIMD_2Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ld3_sret : AdvSIMD_3Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ld4_sret : AdvSIMD_4Vec_PredLoad_Intrinsic;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -558,6 +558,22 @@
                                       F->arg_begin()->getType());
       return true;
     }
+    static const Regex LdRegex("^aarch64\\.sve\\.ld[234](.nxv[a-z0-9]+|$)");
+    if (LdRegex.match(Name)) {
+      Type *ScalarTy =
+          dyn_cast<VectorType>(F->getReturnType())->getElementType();
+      ElementCount EC =
+          dyn_cast<VectorType>(F->arg_begin()->getType())->getElementCount();
+      Type *Ty = VectorType::get(ScalarTy, EC);
+      Intrinsic::ID ID =
+          StringSwitch<Intrinsic::ID>(Name)
+              .StartsWith("aarch64.sve.ld2", Intrinsic::aarch64_sve_ld2_sret)
+              .StartsWith("aarch64.sve.ld3", Intrinsic::aarch64_sve_ld3_sret)
+              .StartsWith("aarch64.sve.ld4", Intrinsic::aarch64_sve_ld4_sret)
+              .Default(Intrinsic::not_intrinsic);
+      NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Ty);
+      return true;
+    }
     if (Name.startswith("arm.neon.vclz")) {
       Type* args[2] = {
           F->arg_begin()->getType(),
@@ -3858,7 +3874,30 @@
     NewCall = Builder.CreateCall(NewFn, Args);
     break;
   }
-
+  case Intrinsic::aarch64_sve_ld3_sret:
+  case Intrinsic::aarch64_sve_ld4_sret:
+  case Intrinsic::aarch64_sve_ld2_sret: {
+    StringRef Name = F->getName();
+    Name = Name.substr(5);
+    unsigned N = StringSwitch<unsigned>(Name)
+                     .StartsWith("aarch64.sve.ld2", 2)
+                     .StartsWith("aarch64.sve.ld3", 3)
+                     .StartsWith("aarch64.sve.ld4", 4)
+                     .Default(0);
+    ScalableVectorType *RetTy =
+        dyn_cast<ScalableVectorType>(F->getReturnType());
+    unsigned MinElts = RetTy->getMinNumElements() / N;
+    SmallVector<Value *, 2> Args(CI->args());
+    Value *NewLdCall = Builder.CreateCall(NewFn, Args);
+    Value *Ret = llvm::PoisonValue::get(RetTy);
+    for (unsigned I = 0; I < N; I++) {
+      Value *Idx = ConstantInt::get(Type::getInt64Ty(C), I * MinElts);
+      Value *SRet = Builder.CreateExtractValue(NewLdCall, I);
+      Ret = Builder.CreateInsertVector(RetTy, Ret, SRet, Idx);
+    }
+    NewCall = dyn_cast<CallInst>(Ret);
+    break;
+  }
   case Intrinsic::arm_neon_bfdot:
   case Intrinsic::arm_neon_bfmmla:
   case Intrinsic::arm_neon_bfmlalb:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1041,8 +1041,6 @@
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
                                          SDValue &Size,
                                          SelectionDAG &DAG) const;
-  SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
-                             EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
   SDValue
  LowerFixedLengthVectorIntDivideToSVE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13840,61 +13840,6 @@
   return true;
 }
 
-// Lower an SVE structured load intrinsic returning a tuple type to target
-// specific intrinsic taking the same input but returning a multi-result value
-// of the split tuple type.
-//
-// E.g. Lowering an LD3:
-//
-//  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
-//                               <vscale x 4 x i1> %pred,
-//                               <vscale x 4 x i32>* %addr)
-//
-// Output DAG:
-//
-//   t0: ch = EntryToken
-//   t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
-//   t4: i64,ch = CopyFromReg t0, Register:i64 %1
-//   t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
-//   t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
-//
-// This is called pre-legalization to avoid widening/splitting issues with
-// non-power-of-2 tuple types used for LD3, such as nxv12i32.
-SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
-                                                  ArrayRef<SDValue> LoadOps,
-                                                  EVT VT, SelectionDAG &DAG,
-                                                  const SDLoc &DL) const {
-  assert(VT.isScalableVector() && "Can only lower scalable vectors");
-
-  unsigned N, Opcode;
-  static const std::pair<unsigned, std::pair<unsigned, unsigned>>
-      IntrinsicMap[] = {
-          {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
-          {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
-          {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
-
-  std::tie(N, Opcode) = llvm::find_if(IntrinsicMap, [&](auto P) {
-                          return P.first == Intrinsic;
-                        })->second;
-  assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
-         "invalid tuple vector type!");
-
-  EVT SplitVT =
-      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
-                       VT.getVectorElementCount().divideCoefficientBy(N));
-  assert(isTypeLegal(SplitVT));
-
-  SmallVector<EVT, 5> VTs(N, SplitVT);
-  VTs.push_back(MVT::Other); // Chain
-  SDVTList NodeTys = DAG.getVTList(VTs);
-
-  SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
-  SmallVector<SDValue, 4> PseudoLoadOps;
-  for (unsigned I = 0; I < N; ++I)
-    PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
-}
-
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
@@ -20400,20 +20345,6 @@
     SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
     return DAG.getMergeValues({Concat, Chain}, DL);
   }
-  case Intrinsic::aarch64_sve_ld2:
-  case Intrinsic::aarch64_sve_ld3:
-  case Intrinsic::aarch64_sve_ld4: {
-    SDLoc DL(N);
-    SDValue Chain = N->getOperand(0);
-    SDValue Mask = N->getOperand(2);
-    SDValue BasePtr = N->getOperand(3);
-    SDValue LoadOps[] = {Chain, Mask, BasePtr};
-    unsigned IntrinsicID =
-        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-    SDValue Result =
-        LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
-    return DAG.getMergeValues({Result, Chain}, DL);
-  }
   case Intrinsic::aarch64_rndr:
   case Intrinsic::aarch64_rndrrs: {
     unsigned IntrinsicID =
diff --git a/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll b/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll
@@ -0,0 +1,74 @@
+; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s
+
+define <vscale x 32 x i8> @ld2.nxv32i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr) {
+; CHECK: %1 = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
@llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8* %base_ptr)
+; CHECK-NEXT: %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+; CHECK-NEXT: %3 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> %2, i64 0)
+; CHECK-NEXT: %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+; CHECK-NEXT: %res = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> %3, <vscale x 16 x i8> %4, i64 16)
+; CHECK-NEXT: ret <vscale x 32 x i8> %res
+%res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
+ret <vscale x 32 x i8> %res
+}
+
+define <vscale x 48 x i8> @ld3.nxv48i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr) {
+; CHECK: %1 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8* %base_ptr)
+; CHECK-NEXT: %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+; CHECK-NEXT: %3 = call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> %2, i64 0)
+; CHECK-NEXT: %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+; CHECK-NEXT: %5 = call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> %3, <vscale x 16 x i8> %4, i64 16)
+; CHECK-NEXT: %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+; CHECK-NEXT: %res = call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> %5, <vscale x 16 x i8> %6, i64 32)
+; CHECK-NEXT: ret <vscale x 48 x i8> %res
+%res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
+ret <vscale x 48 x i8> %res
+}
+
+define <vscale x 64 x i8> @ld4.nxv64i8_lower_bound(<vscale x 16 x i1> %Pg, i8 *%base_ptr) {
+; CHECK: %1 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8* %base_ptr)
+; CHECK-NEXT: %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+; CHECK-NEXT: %3 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> %2, i64 0)
+; CHECK-NEXT: %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+; CHECK-NEXT: %5 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %3, <vscale x 16 x i8> %4, i64 16)
+; CHECK-NEXT: %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+; CHECK-NEXT: %7 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %5, <vscale x 16 x i8> %6, i64 32)
+; CHECK-NEXT: %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+; CHECK-NEXT: %res = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %7, <vscale x 16 x i8> %8, i64 48)
+; CHECK-NEXT: ret <vscale x 64 x i8> %res
+%res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
+ret <vscale x 64 x i8> %res
+}
+
+; Check short mangling name
+
+; ldN intrinsic name without any element type
+define <vscale x 32 x i8> @ld2.nxv32i8_no_eltty(<vscale x 16 x i1> %Pg, i8 *%base_ptr) {
+; CHECK-LABEL: @ld2.nxv32i8_no_eltty
+; CHECK: %1 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8* %base_ptr)
+; CHECK-NEXT: %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+; CHECK-NEXT: %3 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> %2, i64 0)
+; CHECK-NEXT: %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+; CHECK-NEXT: %res = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> %3, <vscale x 16 x i8> %4, i64 16)
+; CHECK-NEXT: ret <vscale x 32 x i8> %res
+%res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
+ret <vscale x 32 x i8> %res
+}
+
+; ldN intrinsic name with only output type
+define <vscale x 32 x i8> @ld2.nxv32i8_no_predty_pty(<vscale x 16 x i1> %Pg, i8 *%base_ptr) {
+; CHECK-LABEL: @ld2.nxv32i8_no_predty_pty
+; CHECK: %1 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8* %base_ptr)
+; CHECK-NEXT: %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+; CHECK-NEXT: %3 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> %2, i64 0)
+; CHECK-NEXT: %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+; CHECK-NEXT: %res = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> %3, <vscale x 16 x i8> %4, i64 16)
+; CHECK-NEXT: ret <vscale x 32 x i8> %res
+%res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
+ret <vscale x 32 x i8> %res
+}
+
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2(<vscale x 16 x i1>, i8 *)
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1>, i8 *)
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -29,10 +29,26 @@
 entry:
   %0 =
call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) %1 = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( %0) - %2 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x0) - %3 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x1) + %2 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x0) + %3 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x1) %4 = call @llvm.aarch64.sve.ld1.nxv2f64( %1, double* %x2) - %call = call float @callee1(float 1.000000e+00, %2, %3, %4) + %5 = extractvalue { , , , } %2, 0 + %6 = extractvalue { , , , } %2, 1 + %7 = extractvalue { , , , } %2, 2 + %8 = extractvalue { , , , } %2, 3 + %9 = call @llvm.vector.insert.nxv8f64.nx2f64( poison, %5, i64 0) + %10 = call @llvm.vector.insert.nxv8f64.nx2f64( %9, %6, i64 2) + %11 = call @llvm.vector.insert.nxv8f64.nx2f64( %10, %7, i64 4) + %12 = call @llvm.vector.insert.nxv8f64.nx2f64( %11, %8, i64 6) + %13 = extractvalue { , , , } %3, 0 + %14 = extractvalue { , , , } %3, 1 + %15 = extractvalue { , , , } %3, 2 + %16 = extractvalue { , , , } %3, 3 + %17 = call @llvm.vector.insert.nxv8f64.nx2f64( undef, %13, i64 0) + %18 = call @llvm.vector.insert.nxv8f64.nx2f64( %17, %14, i64 2) + %19 = call @llvm.vector.insert.nxv8f64.nx2f64( %18, %15, i64 4) + %20 = call @llvm.vector.insert.nxv8f64.nx2f64( %19, %16, i64 6) + %call = call float @callee1(float 1.000000e+00, %12, %20, %4) ret float %call } @@ -73,9 +89,25 @@ entry: %0 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) %1 = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( %0) - %2 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x0) - %3 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x1) - %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, %2, %3) + %2 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x0) + %3 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x1) + %4 = extractvalue { , , , } %2, 0 + %5 = extractvalue { , , , } %2, 1 + %6 = extractvalue { , , , } %2, 2 + %7 = extractvalue { , , , } %2, 3 + %8 = call @llvm.vector.insert.nxv8f64.nx2f64( poison, %4, i64 0) + %9 = call @llvm.vector.insert.nxv8f64.nx2f64( %8, %5, i64 2) + %10 = call @llvm.vector.insert.nxv8f64.nx2f64( %9, %6, i64 4) + %11 = call @llvm.vector.insert.nxv8f64.nx2f64( %10, %7, i64 6) + %12 = extractvalue { , , , } %3, 0 + %13 = extractvalue { , , , } %3, 1 + %14 = extractvalue { , , , } %3, 2 + %15 = extractvalue { , , , } %3, 3 + %16 = call @llvm.vector.insert.nxv8f64.nx2f64( poison, %12, i64 0) + %17 = call @llvm.vector.insert.nxv8f64.nx2f64( %16, %13, i64 2) + %18 = call @llvm.vector.insert.nxv8f64.nx2f64( %17, %14, i64 4) + %19 = call @llvm.vector.insert.nxv8f64.nx2f64( %18, %15, i64 6) + %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, %11, %19) ret float %call } @@ -102,10 +134,24 @@ entry: %0 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) %1 = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( %0) - %2 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x0) - %3 = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1( %1, double* %x1) + %2 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x0) + %3 = call {, , } @llvm.aarch64.sve.ld3.sret.nxv2f64( %1, double* %x1) %4 = call @llvm.aarch64.sve.ld1.nxv2f64( %1, double* %x2) - %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, %2, %3, %4) + %5 = extractvalue { , , , } %2, 0 + %6 = extractvalue { , , , } %2, 1 + %7 = extractvalue { , , , } 
%2, 2 + %8 = extractvalue { , , , } %2, 3 + %9 = call @llvm.vector.insert.nxv8f64.nx2f64( poison, %5, i64 0) + %10 = call @llvm.vector.insert.nxv8f64.nx2f64( %9, %6, i64 2) + %11 = call @llvm.vector.insert.nxv8f64.nx2f64( %10, %7, i64 4) + %12 = call @llvm.vector.insert.nxv8f64.nx2f64( %11, %8, i64 6) + %13 = extractvalue { , , } %3, 0 + %14 = extractvalue { , , } %3, 1 + %15 = extractvalue { , , } %3, 2 + %16 = call @llvm.vector.insert.nxv6f64.nx2f64( poison, %13, i64 0) + %17 = call @llvm.vector.insert.nxv6f64.nx2f64( %16 , %14, i64 2) + %18 = call @llvm.vector.insert.nxv6f64.nx2f64( %17 , %15, i64 4) + %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, %12, %18, %4) ret float %call } @@ -435,9 +481,9 @@ declare @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg) declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() -declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(, double*) -declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(, double*) +declare {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64(, double*) +declare {, , } @llvm.aarch64.sve.ld3.sret.nxv2f64(, double*) declare @llvm.aarch64.sve.ld1.nxv2f64(, double*) declare double @llvm.aarch64.sve.faddv.nxv2f64(, ) -declare @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(, i32 immarg) -declare @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(, i32 immarg) +declare @llvm.vector.insert.nxv8f64.nx2f64(, , i64) +declare @llvm.vector.insert.nxv6f64.nx2f64(, , i64) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll +++ /dev/null @@ -1,539 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s -; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sme < %s | FileCheck %s - -; NOTE: invalid, upper and lower bound immediate values of the regimm -; addressing mode are checked only for the byte version of each -; instruction (`ldb`), as the code for detecting the immediate is -; common to all instructions, and varies only for the number of -; elements of the structure store, which is = 2, 3, 4. 
- -; ld2b -define @ld2.nxv32i8( %Pg, *%addr) { -; CHECK-LABEL: ld2.nxv32i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 2 -%base_ptr = bitcast * %base to i8* -%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld2.nxv32i8_lower_bound( %Pg, *%addr) { -; CHECK-LABEL: ld2.nxv32i8_lower_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, #-16, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -16 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld2.nxv32i8_upper_bound( %Pg, *%addr) { -; CHECK-LABEL: ld2.nxv32i8_upper_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, #14, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 14 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld2.nxv32i8_not_multiple_of_2( %Pg, *%addr) { -; CHECK-LABEL: ld2.nxv32i8_not_multiple_of_2: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #3 -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 3 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld2.nxv32i8_outside_lower_bound( %Pg, *%addr) { -; CHECK-LABEL: ld2.nxv32i8_outside_lower_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #-18 -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -18 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld2.nxv32i8_outside_upper_bound( %Pg, *%addr) { -; CHECK-LABEL: ld2.nxv32i8_outside_upper_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #16 -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 16 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -; ld2h -define @ld2.nxv16i16( %Pg, * %addr) { -; CHECK-LABEL: ld2.nxv16i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, #14, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 14 -%base_ptr = bitcast * %base to i16 * -%res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) -ret %res -} - -define @ld2.nxv16f16( %Pg, * %addr) { -; CHECK-LABEL: ld2.nxv16f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, #-16, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -16 -%base_ptr = bitcast * %base to half * -%res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%base_ptr) -ret %res -} - -define @ld2.nxv16bf16( %Pg, * %addr) #0 { -; CHECK-LABEL: ld2.nxv16bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, #12, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 12 -%base_ptr = bitcast * %base to bfloat * -%res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) -ret %res -} - -; ld2w -define @ld2.nxv8i32( %Pg, * %addr) { -; CHECK-LABEL: ld2.nxv8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0, #14, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 14 -%base_ptr = bitcast * %base to i32 * -%res = call 
@llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) -ret %res -} - -define @ld2.nxv8f32( %Pg, * %addr) { -; CHECK-LABEL: ld2.nxv8f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0, #-16, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -16 -%base_ptr = bitcast * %base to float * -%res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%base_ptr) -ret %res -} - -; ld2d -define @ld2.nxv4i64( %Pg, * %addr) { -; CHECK-LABEL: ld2.nxv4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0, #14, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 14 -%base_ptr = bitcast * %base to i64 * -%res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) -ret %res -} - -define @ld2.nxv4f64( %Pg, * %addr) { -; CHECK-LABEL: ld2.nxv4f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0, #-16, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -16 -%base_ptr = bitcast * %base to double * -%res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%base_ptr) -ret %res -} - -; ld3b -define @ld3.nxv48i8( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv48i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 3 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld3.nxv48i8_lower_bound( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv48i8_lower_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #-24, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -24 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld3.nxv48i8_upper_bound( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv48i8_upper_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #21, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 21 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld3.nxv48i8_not_multiple_of_3_01( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_01: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #4 -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 4 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld3.nxv48i8_not_multiple_of_3_02( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_02: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #5 -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 5 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld3.nxv48i8_outside_lower_bound( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv48i8_outside_lower_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #-27 -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -27 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld3.nxv48i8_outside_upper_bound( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv48i8_outside_upper_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #24 -; 
CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 24 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -; ld3h -define @ld3.nxv24i16( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv24i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 21 -%base_ptr = bitcast * %base to i16 * -%res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) -ret %res -} - -define @ld3.nxv24f16( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv24f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 21 -%base_ptr = bitcast * %base to half * -%res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%base_ptr) -ret %res -} - -define @ld3.nxv24bf16( %Pg, *%addr) #0 { -; CHECK-LABEL: ld3.nxv24bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #-24, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -24 -%base_ptr = bitcast * %base to bfloat * -%res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) -ret %res -} - -; ld3w -define @ld3.nxv12i32( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv12i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #21, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 21 -%base_ptr = bitcast * %base to i32 * -%res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) -ret %res -} - -define @ld3.nxv12f32( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv12f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #-24, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -24 -%base_ptr = bitcast * %base to float * -%res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%base_ptr) -ret %res -} - -; ld3d -define @ld3.nxv6i64( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv6i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #21, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 21 -%base_ptr = bitcast * %base to i64 * -%res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) -ret %res -} - -define @ld3.nxv6f64( %Pg, *%addr) { -; CHECK-LABEL: ld3.nxv6f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #-24, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -24 -%base_ptr = bitcast * %base to double * -%res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%base_ptr) -ret %res -} - -; ; ld4b -define @ld4.nxv64i8( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv64i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #4, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 4 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld4.nxv64i8_lower_bound( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv64i8_lower_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #-32, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -32 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld4.nxv64i8_upper_bound( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv64i8_upper_bound: -; CHECK: // %bb.0: -; 
CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #28, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 28 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld4.nxv64i8_not_multiple_of_4_01( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_01: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #5 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 5 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld4.nxv64i8_not_multiple_of_4_02( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_02: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #6 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 6 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld4.nxv64i8_not_multiple_of_4_03( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_03: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #7 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 7 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld4.nxv64i8_outside_lower_bound( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv64i8_outside_lower_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: mov x9, #-576 -; CHECK-NEXT: lsr x8, x8, #4 -; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #4) #9) -; xM = -9 * 2^6 -; xP = RDVL * 2^-4 -; xOFFSET = RDVL * 2^-4 * -9 * 2^6 = RDVL * -36 -%base = getelementptr , * %addr, i64 -36 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -define @ld4.nxv64i8_outside_upper_bound( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv64i8_outside_upper_bound: -; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: mov w9, #512 -; CHECK-NEXT: lsr x8, x8, #4 -; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] -; CHECK-NEXT: ret -; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #16) #2) -; xM = 2^9 -; xP = RDVL * 2^-4 -; xOFFSET = RDVL * 2^-4 * 2^9 = RDVL * 32 -%base = getelementptr , * %addr, i64 32 -%base_ptr = bitcast * %base to i8 * -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) -ret %res -} - -; ld4h -define @ld4.nxv32i16( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv32i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #8, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 8 -%base_ptr = bitcast * %base to i16 * -%res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) -ret %res -} - -define @ld4.nxv32f16( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv32f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #28, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 28 -%base_ptr = bitcast * %base to half * -%res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%base_ptr) -ret %res -} - -define @ld4.nxv32bf16( %Pg, *%addr) #0 { -; CHECK-LABEL: 
ld4.nxv32bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #-32, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -32 -%base_ptr = bitcast * %base to bfloat * -%res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) -ret %res -} - -; ld4w -define @ld4.nxv16i32( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #28, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 28 -%base_ptr = bitcast * %base to i32 * -%res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) -ret %res -} - -define @ld4.nxv16f32( %Pg, * %addr) { -; CHECK-LABEL: ld4.nxv16f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #-32, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -32 -%base_ptr = bitcast * %base to float * -%res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %Pg, float *%base_ptr) -ret %res -} - -; ld4d -define @ld4.nxv8i64( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #28, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 28 -%base_ptr = bitcast * %base to i64 * -%res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) -ret %res -} - -define @ld4.nxv8f64( %Pg, *%addr) { -; CHECK-LABEL: ld4.nxv8f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #-32, mul vl] -; CHECK-NEXT: ret -%base = getelementptr , * %addr, i64 -32 -%base_ptr = bitcast * %base to double * -%res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %Pg, double * %base_ptr) -ret %res -} - -declare @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(, half*) -declare @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(, double*) - -declare @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(, half*) -declare @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(, double*) - -declare @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(, half*) -declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) - -; +bf16 is required for the bfloat version. 
-attributes #0 = { "target-features"="+bf16" } diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll +++ /dev/null @@ -1,285 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s -; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sme < %s | FileCheck %s - -; ld2b -define @ld2.nxv32i8( %Pg, i8 *%addr, i64 %a) { -; CHECK-LABEL: ld2.nxv32i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x1] -; CHECK-NEXT: ret -%addr2 = getelementptr i8, i8 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%addr2) -ret %res -} - -; ld2h -define @ld2.nxv16i16( %Pg, i16 *%addr, i64 %a) { -; CHECK-LABEL: ld2.nxv16i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr i16, i16 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%addr2) -ret %res -} - -define @ld2.nxv16f16( %Pg, half *%addr, i64 %a) { -; CHECK-LABEL: ld2.nxv16f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr half, half * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%addr2) -ret %res -} - -define @ld2.nxv16bf16( %Pg, bfloat *%addr, i64 %a) #0 { -; CHECK-LABEL: ld2.nxv16bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) -ret %res -} - -; ld2w -define @ld2.nxv8i32( %Pg, i32 *%addr, i64 %a) { -; CHECK-LABEL: ld2.nxv8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: ret -%addr2 = getelementptr i32, i32 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%addr2) -ret %res -} - -define @ld2.nxv8f32( %Pg, float *%addr, i64 %a) { -; CHECK-LABEL: ld2.nxv8f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: ret -%addr2 = getelementptr float, float * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%addr2) -ret %res -} - -; ld2d -define @ld2.nxv4i64( %Pg, i64 *%addr, i64 %a) { -; CHECK-LABEL: ld2.nxv4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] -; CHECK-NEXT: ret -%addr2 = getelementptr i64, i64 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%addr2) -ret %res -} - -define @ld2.nxv4f64( %Pg, double *%addr, i64 %a) { -; CHECK-LABEL: ld2.nxv4f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] -; CHECK-NEXT: ret -%addr2 = getelementptr double, double * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%addr2) -ret %res -} - -; ld3b -define @ld3.nxv48i8( %Pg, i8 *%addr, i64 %a) { -; CHECK-LABEL: ld3.nxv48i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x1] -; CHECK-NEXT: ret -%addr2 = getelementptr i8, i8 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%addr2) -ret %res -} - -; ld3h -define @ld3.nxv24i16( %Pg, i16 *%addr, i64 %a) { -; CHECK-LABEL: ld3.nxv24i16: -; CHECK: 
// %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr i16, i16 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%addr2) -ret %res -} - -define @ld3.nxv24f16( %Pg, half *%addr, i64 %a) { -; CHECK-LABEL: ld3.nxv24f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr half, half * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%addr2) -ret %res -} - -define @ld3.nxv24bf16( %Pg, bfloat *%addr, i64 %a) #0 { -; CHECK-LABEL: ld3.nxv24bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) -ret %res -} - -; ld3w -define @ld3.nxv12i32( %Pg, i32 *%addr, i64 %a) { -; CHECK-LABEL: ld3.nxv12i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: ret -%addr2 = getelementptr i32, i32 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%addr2) -ret %res -} - -define @ld3.nxv12f32( %Pg, float *%addr, i64 %a) { -; CHECK-LABEL: ld3.nxv12f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: ret -%addr2 = getelementptr float, float * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%addr2) -ret %res -} - -; ld3d -define @ld3.nxv6i64( %Pg, i64 *%addr, i64 %a) { -; CHECK-LABEL: ld3.nxv6i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] -; CHECK-NEXT: ret -%addr2 = getelementptr i64, i64 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%addr2) -ret %res -} - -define @ld3.nxv6f64( %Pg, double *%addr, i64 %a) { -; CHECK-LABEL: ld3.nxv6f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] -; CHECK-NEXT: ret -%addr2 = getelementptr double, double * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%addr2) -ret %res -} - -; ld4b -define @ld4.nxv64i8( %Pg, i8 *%addr, i64 %a) { -; CHECK-LABEL: ld4.nxv64i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x1] -; CHECK-NEXT: ret -%addr2 = getelementptr i8, i8 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%addr2) -ret %res -} - -; ld4h -define @ld4.nxv32i16( %Pg, i16 *%addr, i64 %a) { -; CHECK-LABEL: ld4.nxv32i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr i16, i16 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%addr2) -ret %res -} - -define @ld4.nxv32f16( %Pg, half *%addr, i64 %a) { -; CHECK-LABEL: ld4.nxv32f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr half, half * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%addr2) -ret %res -} - -define @ld4.nxv32bf16( %Pg, bfloat *%addr, i64 %a) #0 { -; CHECK-LABEL: ld4.nxv32bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: ret -%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat 
*%addr2) -ret %res -} - -; ld4w -define @ld4.nxv16i32( %Pg, i32 *%addr, i64 %a) { -; CHECK-LABEL: ld4.nxv16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: ret -%addr2 = getelementptr i32, i32 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%addr2) -ret %res -} - -define @ld4.nxv16f32( %Pg, float *%addr, i64 %a) { -; CHECK-LABEL: ld4.nxv16f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: ret -%addr2 = getelementptr float, float * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %Pg, float *%addr2) -ret %res -} - -; ld4d -define @ld4.nxv8i64( %Pg, i64 *%addr, i64 %a) { -; CHECK-LABEL: ld4.nxv8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] -; CHECK-NEXT: ret -%addr2 = getelementptr i64, i64 * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %Pg, i64 *%addr2) -ret %res -} - -define @ld4.nxv8f64( %Pg, double *%addr, i64 %a) { -; CHECK-LABEL: ld4.nxv8f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] -; CHECK-NEXT: ret -%addr2 = getelementptr double, double * %addr, i64 %a -%res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %Pg, double *%addr2) -ret %res -} - -declare @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(, half*) -declare @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(, double*) - -declare @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(, half*) -declare @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(, double*) - -declare @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(, half*) -declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) - -; +bf16 is required for the bfloat version. 
-attributes #0 = { "target-features"="+bf16" } diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll @@ -603,270 +603,6 @@ ret %res } -; -; LD2B -; - -define @ld2b_i8( %pred, i8* %addr) { -; CHECK-LABEL: ld2b_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %pred, i8* %addr) - ret %res -} - -; -; LD2H -; - -define @ld2h_i16( %pred, i16* %addr) { -; CHECK-LABEL: ld2h_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %pred, i16* %addr) - ret %res -} - -define @ld2h_f16( %pred, half* %addr) { -; CHECK-LABEL: ld2h_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %pred, half* %addr) - ret %res -} - -define @ld2h_bf16( %pred, bfloat* %addr) { -; CHECK-LABEL: ld2h_bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %pred, bfloat* %addr) - ret %res -} - -; -; LD2W -; - -define @ld2w_i32( %pred, i32* %addr) { -; CHECK-LABEL: ld2w_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %pred, i32* %addr) - ret %res -} - -define @ld2w_f32( %pred, float* %addr) { -; CHECK-LABEL: ld2w_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %pred, float* %addr) - ret %res -} - -; -; LD2D -; - -define @ld2d_i64( %pred, i64* %addr) { -; CHECK-LABEL: ld2d_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %pred, i64* %addr) - ret %res -} - -define @ld2d_f64( %pred, double* %addr) { -; CHECK-LABEL: ld2d_f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %pred, double* %addr) - ret %res -} - -; -; LD3B -; - -define @ld3b_i8( %pred, i8* %addr) { -; CHECK-LABEL: ld3b_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %pred, i8* %addr) - ret %res -} - -; -; LD3H -; - -define @ld3h_i16( %pred, i16* %addr) { -; CHECK-LABEL: ld3h_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %pred, i16* %addr) - ret %res -} - -define @ld3h_f16( %pred, half* %addr) { -; CHECK-LABEL: ld3h_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %pred, half* %addr) - ret %res -} - -define @ld3h_bf16( %pred, bfloat* %addr) { -; CHECK-LABEL: ld3h_bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %pred, bfloat* %addr) - ret %res -} - -; -; LD3W -; - -define @ld3w_i32( %pred, i32* %addr) { -; CHECK-LABEL: ld3w_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] 
-; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %pred, i32* %addr) - ret %res -} - -define @ld3w_f32( %pred, float* %addr) { -; CHECK-LABEL: ld3w_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %pred, float* %addr) - ret %res -} - -; -; LD3D -; - -define @ld3d_i64( %pred, i64* %addr) { -; CHECK-LABEL: ld3d_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %pred, i64* %addr) - ret %res -} - -define @ld3d_f64( %pred, double* %addr) { -; CHECK-LABEL: ld3d_f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %pred, double* %addr) - ret %res -} - -; -; LD4B -; - -define @ld4b_i8( %pred, i8* %addr) { -; CHECK-LABEL: ld4b_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %pred, i8* %addr) - ret %res -} - -; -; LD4H -; - -define @ld4h_i16( %pred, i16* %addr) { -; CHECK-LABEL: ld4h_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %pred, i16* %addr) - ret %res -} - -define @ld4h_f16( %pred, half* %addr) { -; CHECK-LABEL: ld4h_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %pred, half* %addr) - ret %res -} - -define @ld4h_bf16( %pred, bfloat* %addr) { -; CHECK-LABEL: ld4h_bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %pred, bfloat* %addr) - ret %res -} - -; -; LD4W -; - -define @ld4w_i32( %pred, i32* %addr) { -; CHECK-LABEL: ld4w_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %pred, i32* %addr) - ret %res -} - -define @ld4w_f32( %pred, float* %addr) { -; CHECK-LABEL: ld4w_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %pred, float* %addr) - ret %res -} - -; -; LD4D -; - -define @ld4d_i64( %pred, i64* %addr) { -; CHECK-LABEL: ld4d_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %pred, i64* %addr) - ret %res -} - -define @ld4d_f64( %pred, double* %addr) { -; CHECK-LABEL: ld4d_f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %pred, double* %addr) - ret %res -} - declare @llvm.aarch64.sve.ld1rq.nxv16i8(, i8*) declare @llvm.aarch64.sve.ld1rq.nxv8i16(, i16*) @@ -886,33 +622,6 @@ declare @llvm.aarch64.sve.ldnt1.nxv4f32(, float*) declare @llvm.aarch64.sve.ldnt1.nxv2f64(, double*) -declare @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(, half*) 
-declare @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(, double*) - -declare @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(, half*) -declare @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(, double*) - -declare @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(, i8*) -declare @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(, i16*) -declare @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(, i32*) -declare @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(, i64*) -declare @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(, half*) -declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) -declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) -declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) - declare @llvm.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) declare @llvm.vector.insert.nxv2f64.v2f64(, <2 x double>, i64) declare @llvm.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) diff --git a/llvm/test/CodeGen/AArch64/sve-merging-stores.ll b/llvm/test/CodeGen/AArch64/sve-merging-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-merging-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-merging-stores.ll @@ -3,10 +3,7 @@ %complex = type { { double, double } } ; Function Attrs: argmemonly nounwind readonly -declare @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64(, i32 immarg) #3 - -; Function Attrs: argmemonly nounwind readonly -declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1(, double*) #3 +declare { , } @llvm.aarch64.sve.ld2.sret.nxv2f64(, double*) #3 ; Function Attrs: nounwind readnone declare double @llvm.aarch64.sve.faddv.nxv2f64(, ) #2 @@ -20,10 +17,10 @@ ; CHECK-NEXT: str q2, [x0] %realp = getelementptr inbounds %complex, %complex* %outval, i64 0, i32 0, i32 0 %imagp = getelementptr inbounds %complex, %complex* %outval, i64 0, i32 0, i32 1 - %1 = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1( %pred, double* nonnull %inptr) - %2 = call @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64( %1, i32 0) + %1 = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( %pred, double* nonnull %inptr) + %2 = extractvalue { , } %1, 0 %3 = call double @llvm.aarch64.sve.faddv.nxv2f64( %pred, %2) - %4 = call @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64( %1, i32 1) + %4 = extractvalue { , } %1, 1 %5 = call double @llvm.aarch64.sve.faddv.nxv2f64( %pred, %4) store double %3, double* %realp, align 8 store double %5, double* %imagp, align 8
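
Note for out-of-tree users: IR that still calls the removed llvm.aarch64.sve.ld2/ld3/ld4 intrinsics keeps working through the bitcode auto-upgrader added above, but code that generates such calls can also emit the surviving sret form directly. The sketch below shows one possible way to do that with IRBuilder; the helper name emitSVELd2 and its exact signature are illustrative assumptions, not part of this patch.

// Minimal sketch, assuming a tree where llvm.aarch64.sve.ld2.sret is
// available (i.e. after this change). emitSVELd2 is a hypothetical helper.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include <utility>

using namespace llvm;

// Emits a predicated two-vector structure load and returns the two
// de-interleaved parts, mirroring what the auto-upgrader produces before it
// re-packs them with llvm.vector.insert for legacy callers.
static std::pair<Value *, Value *> emitSVELd2(IRBuilder<> &B, Module &M,
                                              Value *Pred, Value *BasePtr,
                                              ScalableVectorType *EltVecTy) {
  // The sret intrinsic is overloaded only on the single-vector result type.
  Function *Ld2 = Intrinsic::getDeclaration(
      &M, Intrinsic::aarch64_sve_ld2_sret, {EltVecTy});
  CallInst *LD = B.CreateCall(Ld2, {Pred, BasePtr});
  // The result is a literal struct; consumers use extractvalue instead of the
  // old wide "tuple" vector plus aarch64.sve.tuple.get.
  return {B.CreateExtractValue(LD, 0), B.CreateExtractValue(LD, 1)};
}

A caller would typically feed the two halves straight into further SVE intrinsics, much as the updated sve-merging-stores.ll test passes each extracted part to llvm.aarch64.sve.faddv.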