Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2564,6 +2564,16 @@
     return false;
   }
 
+  /// Lower an SVE (AArch64) structured load returning a wide type to a
+  /// target-specific intrinsic that splits the wide type and creates a
+  /// multi-result node.
+  virtual SDValue lowerStructuredLoad(unsigned Intrinsic, const CallInst &I,
+                                      ArrayRef<SDValue> LoadOps,
+                                      SelectionDAG &DAG,
+                                      const SDLoc &DL) const {
+    llvm_unreachable("Not Implemented");
+  }
+
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -802,6 +802,10 @@
     : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
                 [IntrReadMem, IntrArgMemOnly, ImmArg<1>]>;
 
+  class AdvSIMD_ManyVec_PredLoad_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyptr_ty],
+                [IntrReadMem, IntrArgMemOnly]>;
+
   class AdvSIMD_1Vec_PredLoad_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1298,6 +1302,10 @@
 // Loads
 //
 
+def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+
 def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7033,6 +7033,17 @@
                              DAG.getZExtOrTrunc(Const, getCurSDLoc(), DestVT)));
     return;
   }
+  case Intrinsic::aarch64_sve_ld2:
+  case Intrinsic::aarch64_sve_ld3:
+  case Intrinsic::aarch64_sve_ld4: {
+    SDValue Chain = getRoot();
+    SDValue Mask = getValue(I.getArgOperand(0));
+    SDValue BasePtr = getValue(I.getArgOperand(1));
+    SDValue LoadOps[] = {Chain, Mask, BasePtr};
+    SDValue Result = TLI.lowerStructuredLoad(Intrinsic, I, LoadOps, DAG, sdl);
+    setValue(&I, Result);
+    return;
+  }
   case Intrinsic::aarch64_sve_tuple_get: {
     SDValue Src1 = getValue(I.getOperand(0));
     SDValue Idx = getValue(I.getOperand(1));
Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -237,6 +237,8 @@
                   unsigned SubRegIdx);
   void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc,
+                            unsigned SubRegIdx);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
   /// SVE Reg+Imm addressing mode.
@@ -1387,6 +1389,29 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
+                                               const unsigned Opc,
+                                               unsigned SubRegIdx) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  SDValue Ops[] = {N->getOperand(1), // Predicate
+                   N->getOperand(2), // Memory operand
+                   CurDAG->getTargetConstant(0, dl, MVT::i64), Chain};
+
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+  SDNode *Load = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  SDValue SuperReg = SDValue(Load, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl,
+                                                              VT, SuperReg));
+
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Load, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
                                       unsigned Opc) {
   SDLoc dl(N);
@@ -4445,6 +4470,54 @@
     }
     break;
   }
+  case AArch64ISD::SVE_LD2: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM, AArch64::zsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::SVE_LD3: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM, AArch64::zsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::SVE_LD4: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM, AArch64::zsub0);
+      return;
+    }
+    break;
+  }
   }
 
   // Select the default instruction
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -225,6 +225,11 @@
   LDFF1,
   LDFF1S,
 
+  // Structured loads.
+  SVE_LD2,
+  SVE_LD3,
+  SVE_LD4,
+
   // Unsigned gather loads.
   GLD1,
   GLD1_SCALED,
@@ -463,6 +468,10 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
+  SDValue lowerStructuredLoad(unsigned Intrinsic, const CallInst &I,
+                              ArrayRef<SDValue> LoadOps, SelectionDAG &DAG,
+                              const SDLoc &DL) const override;
+
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1404,6 +1404,9 @@
   case AArch64ISD::INSR:              return "AArch64ISD::INSR";
   case AArch64ISD::PTEST:             return "AArch64ISD::PTEST";
   case AArch64ISD::PTRUE:             return "AArch64ISD::PTRUE";
+  case AArch64ISD::SVE_LD2:           return "AArch64ISD::SVE_LD2";
+  case AArch64ISD::SVE_LD3:           return "AArch64ISD::SVE_LD3";
+  case AArch64ISD::SVE_LD4:           return "AArch64ISD::SVE_LD4";
   case AArch64ISD::LDNF1:             return "AArch64ISD::LDNF1";
   case AArch64ISD::LDNF1S:            return "AArch64ISD::LDNF1S";
   case AArch64ISD::LDFF1:             return "AArch64ISD::LDFF1";
@@ -9617,6 +9620,67 @@
   return true;
 }
 
+// Lower an SVE structured load intrinsic returning a tuple type to a
+// target-specific intrinsic taking the same input but returning a
+// multi-result value of the split tuple type.
+//
+// E.g. Lowering an ld3:
+//
+//  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
+//                                                  <vscale x 4 x i1> %pred,
+//                                                  <vscale x 12 x i32>* %addr)
+//
+//  Output DAG:
+//
+//    t0: ch = EntryToken
+//        t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
+//        t4: i64,ch = CopyFromReg t0, Register:i64 %1
+//      t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
+//    t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
+//
+// This is called from SelectionDAGBuilder to avoid legalization issues with
+// wide irregular types such as nxv12i32 in the example above and other types
+// which aren't a power of 2 wide.
+SDValue AArch64TargetLowering::lowerStructuredLoad(unsigned Intrinsic,
+                                                   const CallInst &I,
+                                                   ArrayRef<SDValue> LoadOps,
+                                                   SelectionDAG &DAG,
+                                                   const SDLoc &DL) const {
+  auto VT = EVT::getEVT(I.getType());
+  assert(VT.isScalableVector() && "Can only lower scalable vectors");
+
+  unsigned N, Opcode;
+  switch (Intrinsic) {
+  case Intrinsic::aarch64_sve_ld2:
+    N = 2;
+    Opcode = AArch64ISD::SVE_LD2;
+    break;
+  case Intrinsic::aarch64_sve_ld3:
+    N = 3;
+    Opcode = AArch64ISD::SVE_LD3;
+    break;
+  case Intrinsic::aarch64_sve_ld4:
+    N = 4;
+    Opcode = AArch64ISD::SVE_LD4;
+    break;
+  default:
+    llvm_unreachable("unhandled intrinsic");
+  }
+
+  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                 VT.getVectorElementCount() / N);
+  assert(isTypeLegal(SplitVT));
+
+  SmallVector<EVT, 5> VTs(N, SplitVT);
+  VTs.push_back(MVT::Other); // Chain
+  SDVTList NodeTys = DAG.getVTList(VTs);
+
+  SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
+  SmallVector<SDValue, 4> PseudoLoadOps;
+  for (unsigned I = 0; I < N; ++I)
+    PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
+}
+
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads-with-extract.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads-with-extract.ll
@@ -0,0 +1,727 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
+
+;
+; LD2B
+;
+
+define <vscale x 16 x i8> @ld2b_i8_0(<vscale x 16 x i1> %pred, <vscale x 32 x i8>* %addr) {
+; CHECK-LABEL: ld2b_i8_0:
+; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1> %pred,
+                                                               <vscale x 32 x i8>* %addr)
+  %v1 = call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv32i8(<vscale x 32 x i8> %res, i32 0)
+  ret <vscale x 16 x i8> %v1
+}
+
+define <vscale x 16 x i8> @ld2b_i8_1(<vscale x 16 x i1> %pred, <vscale x 32 x i8>* %addr) {
+; CHECK-LABEL: ld2b_i8_1:
+; CHECK: ld2b { z31.b, z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1> %pred,
+                                                               <vscale x 32 x i8>* %addr)
+  %v2 = call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv32i8(<vscale x 32 x i8> %res, i32 1)
+  ret <vscale x 16 x i8> %v2
+}
+
+;
+; LD2H
+;
+
+define <vscale x 8 x i16> @ld2h_i16_0(<vscale x 8 x i1> %pred, <vscale x 16 x i16>* %addr) {
+; CHECK-LABEL: ld2h_i16_0:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16(<vscale x 8 x i1> %pred,
+                                                                 <vscale x 16 x i16>* %addr)
+  %v1 = call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv16i16(<vscale x 16 x i16> %res, i32 0)
+  ret <vscale x 8 x i16> %v1
+}
+
+define <vscale x 8 x i16> @ld2h_i16_1(<vscale x 8 x i1> %pred, <vscale x 16 x i16>* %addr) {
+; CHECK-LABEL: ld2h_i16_1:
+; CHECK: ld2h { z31.h, z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16(<vscale x 8 x i1> %pred,
+                                                                 <vscale x 16 x i16>* %addr)
+  %v2 = call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv16i16(<vscale x 16 x i16> %res, i32 1)
+  ret <vscale x 8 x i16> %v2
+}
+
+define <vscale x 8 x half> @ld2h_f16_0(<vscale x 8 x i1> %pred, <vscale x 16 x half>* %addr) {
+; CHECK-LABEL: ld2h_f16_0:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16(<vscale x 8 x i1> %pred,
+                                                                  <vscale x 16 x half>* %addr)
+  %v1 = call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv16f16(<vscale x 16 x half> %res, i32 0)
+  ret <vscale x 8 x half> %v1
+}
+
+define <vscale x 8 x half> @ld2h_f16_1(<vscale x 8 x i1> %pred, <vscale x 16 x half>* %addr) {
+; CHECK-LABEL: ld2h_f16_1:
+; CHECK: ld2h { z31.h, z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16(<vscale x 8 x i1> %pred,
+                                                                  <vscale x 16 x half>* %addr)
+  %v2 = call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv16f16(<vscale x 16 x half> %res, i32 1)
+  ret <vscale x 8 x half> %v2
+}
+
+;
+; LD2W
+;
+
+define <vscale x 4 x i32> @ld2w_i32_0(<vscale x 4 x i1> %pred, <vscale x 8 x i32>* %addr) {
+; CHECK-LABEL: ld2w_i32_0:
+; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32(<vscale x 4 x i1> %pred,
+                                                               <vscale x 8 x i32>* %addr)
+  %v1 = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv8i32(<vscale x 8 x i32> %res, i32 0)
+  ret <vscale x 4 x i32> %v1
+}
+
+define <vscale x 4 x i32> @ld2w_i32_1(<vscale x 4 x i1> %pred, <vscale x 8 x i32>*
%addr) { +; CHECK-LABEL: ld2w_i32_1: +; CHECK: ld2w { z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8i32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv8i32( %res, i32 1) + ret %v2 +} + +define @ld2w_f32_0( %pred, * %addr) { +; CHECK-LABEL: ld2w_f32_0: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8f32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv8f32( %res, i32 0) + ret %v1 +} + +define @ld2w_f32_1( %pred, * %addr) { +; CHECK-LABEL: ld2w_f32_1: +; CHECK: ld2w { z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8f32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv8f32( %res, i32 1) + ret %v2 +} + +; +; LD2D +; + +define @ld2d_i64_0( %pred, * %addr) { +; CHECK-LABEL: ld2d_i64_0: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4i64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv4i64( %res, i32 0) + ret %v1 +} + +define @ld2d_i64_1( %pred, * %addr) { +; CHECK-LABEL: ld2d_i64_1: +; CHECK: ld2d { z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4i64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv4i64( %res, i32 1) + ret %v2 +} + +define @ld2d_f64_0( %pred, * %addr) { +; CHECK-LABEL: ld2d_f64_0: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4f64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv4f64( %res, i32 0) + ret %v1 +} + +define @ld2d_f64_1( %pred, * %addr) { +; CHECK-LABEL: ld2d_f64_1: +; CHECK: ld2d { z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4f64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv4f64( %res, i32 1) + ret %v2 +} + +; +; LD3B +; + +define @ld3b_i8_0( %pred, * %addr) { +; CHECK-LABEL: ld3b_i8_0: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv48i8( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv48i8( %res, i32 0); + ret %v1 +} + +define @ld3b_i8_1( %pred, * %addr) { +; CHECK-LABEL: ld3b_i8_1: +; CHECK: ld3b { z31.b, z0.b, z1.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv48i8( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv48i8( %res, i32 1); + ret %v2 +} + +define @ld3b_i8_2( %pred, * %addr) { +; CHECK-LABEL: ld3b_i8_2: +; CHECK: ld3b { z30.b, z31.b, z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv48i8( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv48i8( %res, i32 2); + ret %v3 +} + +; +; LD3H +; + +define @ld3h_i16_0( %pred, * %addr) { +; CHECK-LABEL: ld3h_i16_0: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24i16( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv24i16( %res, i32 0); + ret %v1 +} + +define @ld3h_i16_1( %pred, * %addr) { +; CHECK-LABEL: ld3h_i16_1: +; CHECK: ld3h { z31.h, z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24i16( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv24i16( %res, i32 1); + ret %v2 +} + +define @ld3h_i16_2( %pred, * %addr) { +; CHECK-LABEL: ld3h_i16_2: +; CHECK: ld3h { z30.h, z31.h, z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24i16( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv24i16( %res, i32 2); + 
ret %v3 +} + +define @ld3h_f16_0( %pred, * %addr) { +; CHECK-LABEL: ld3h_f16_0: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24f16( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv24f16( %res, i32 0); + ret %v1 +} + +define @ld3h_f16_1( %pred, * %addr) { +; CHECK-LABEL: ld3h_f16_1: +; CHECK: ld3h { z31.h, z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24f16( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv24f16( %res, i32 1); + ret %v2 +} + +define @ld3h_f16_2( %pred, * %addr) { +; CHECK-LABEL: ld3h_f16_2: +; CHECK: ld3h { z30.h, z31.h, z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24f16( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv24f16( %res, i32 2); + ret %v3 +} + +; +; LD3W +; + +define @ld3w_i32_0( %pred, * %addr) { +; CHECK-LABEL: ld3w_i32_0: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12i32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv12i32( %res, i32 0); + ret %v1 +} + +define @ld3w_i32_1( %pred, * %addr) { +; CHECK-LABEL: ld3w_i32_1: +; CHECK: ld3w { z31.s, z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12i32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv12i32( %res, i32 1); + ret %v2 +} + +define @ld3w_i32_2( %pred, * %addr) { +; CHECK-LABEL: ld3w_i32_2: +; CHECK: ld3w { z30.s, z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12i32( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv12i32( %res, i32 2); + ret %v3 +} + +define @ld3w_f32_0( %pred, * %addr) { +; CHECK-LABEL: ld3w_f32_0: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12f32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv12f32( %res, i32 0); + ret %v1 +} + +define @ld3w_f32_1( %pred, * %addr) { +; CHECK-LABEL: ld3w_f32_1: +; CHECK: ld3w { z31.s, z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12f32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv12f32( %res, i32 1); + ret %v2 +} + +define @ld3w_f32_2( %pred, * %addr) { +; CHECK-LABEL: ld3w_f32_2: +; CHECK: ld3w { z30.s, z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12f32( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv12f32( %res, i32 2); + ret %v3 +} + +; +; LD3D +; + +define @ld3d_i64_0( %pred, * %addr) { +; CHECK-LABEL: ld3d_i64_0: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6i64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv6i64( %res, i32 0); + ret %v1 +} + +define @ld3d_i64_1( %pred, * %addr) { +; CHECK-LABEL: ld3d_i64_1: +; CHECK: ld3d { z31.d, z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6i64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv6i64( %res, i32 1); + ret %v2 +} + +define @ld3d_i64_2( %pred, * %addr) { +; CHECK-LABEL: ld3d_i64_2: +; CHECK: ld3d { z30.d, z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6i64( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv6i64( %res, i32 2); + ret %v3 +} + +define @ld3d_f64_0( %pred, * %addr) { +; CHECK-LABEL: ld3d_f64_0: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call 
@llvm.aarch64.sve.ld3.nxv6f64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv6f64( %res, i32 0); + ret %v1 +} + +define @ld3d_f64_1( %pred, * %addr) { +; CHECK-LABEL: ld3d_f64_1: +; CHECK: ld3d { z31.d, z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6f64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv6f64( %res, i32 1); + ret %v2 +} + +define @ld3d_f64_2( %pred, * %addr) { +; CHECK-LABEL: ld3d_f64_2: +; CHECK: ld3d { z30.d, z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6f64( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv6f64( %res, i32 2); + ret %v3 +} + +; +; LD4B +; + +define @ld4b_i8_0( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8_0: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv64i8( %res, i32 0); + ret %v1 +} + +define @ld4b_i8_1( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8_1: +; CHECK: ld4b { z31.b, z0.b, z1.b, z2.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv64i8( %res, i32 1); + ret %v2 +} + +define @ld4b_i8_2( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8_2: +; CHECK: ld4b { z30.b, z31.b, z0.b, z1.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv64i8( %res, i32 2); + ret %v3 +} + +define @ld4b_i8_3( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8_3: +; CHECK: ld4b { z29.b, z30.b, z31.b, z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv64i8( %res, i32 3); + ret %v4 +} + +; +; LD4H +; + +define @ld4h_i16_0( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16_0: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv32i16( %res, i32 0); + ret %v1 +} + +define @ld4h_i16_1( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16_1: +; CHECK: ld4h { z31.h, z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv32i16( %res, i32 1); + ret %v2 +} + +define @ld4h_i16_2( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16_2: +; CHECK: ld4h { z30.h, z31.h, z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv32i16( %res, i32 2); + ret %v3 +} + +define @ld4h_i16_3( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16_3: +; CHECK: ld4h { z29.h, z30.h, z31.h, z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv32i16( %res, i32 3); + ret %v4 +} + +define @ld4h_f16_0( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16_0: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv32f16( %res, i32 0); + ret %v1 +} + +define @ld4h_f16_1( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16_1: +; CHECK: ld4h { z31.h, z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv32f16( %res, i32 1); + ret %v2 +} + 
+define @ld4h_f16_2( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16_2: +; CHECK: ld4h { z30.h, z31.h, z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv32f16( %res, i32 2); + ret %v3 +} + +define @ld4h_f16_3( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16_3: +; CHECK: ld4h { z29.h, z30.h, z31.h, z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv32f16( %res, i32 3); + ret %v4 +} + +; +; LD4W +; + +define @ld4w_i32_0( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32_0: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv16i32( %res, i32 0); + ret %v1 +} + +define @ld4w_i32_1( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32_1: +; CHECK: ld4w { z31.s, z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv16i32( %res, i32 1); + ret %v2 +} + +define @ld4w_i32_2( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32_2: +; CHECK: ld4w { z30.s, z31.s, z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv16i32( %res, i32 2); + ret %v3 +} + +define @ld4w_i32_3( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32_3: +; CHECK: ld4w { z29.s, z30.s, z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv16i32( %res, i32 3); + ret %v4 +} + +define @ld4w_f32_0( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32_0: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv16f32( %res, i32 0); + ret %v1 +} + +define @ld4w_f32_1( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32_1: +; CHECK: ld4w { z31.s, z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv16f32( %res, i32 1); + ret %v2 +} + +define @ld4w_f32_2( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32_2: +; CHECK: ld4w { z30.s, z31.s, z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv16f32( %res, i32 2); + ret %v3 +} + +define @ld4w_f32_3( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32_3: +; CHECK: ld4w { z29.s, z30.s, z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv16f32( %res, i32 3); + ret %v4 +} + +; +; LD4D +; + +define @ld4d_i64_0( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64_0: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv8i64( %res, i32 0); + ret %v1 +} + +define @ld4d_i64_1( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64_1: +; CHECK: ld4d { z31.d, z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv8i64( %res, i32 1); + ret %v2 +} + +define @ld4d_i64_2( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64_2: +; CHECK: ld4d { z30.d, z31.d, 
z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv8i64( %res, i32 2); + ret %v3 +} + +define @ld4d_i64_3( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64_3: +; CHECK: ld4d { z29.d, z30.d, z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv8i64( %res, i32 3); + ret %v4 +} + +define @ld4d_f64_0( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64_0: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv8f64( %res, i32 0); + ret %v1 +} + +define @ld4d_f64_1( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64_1: +; CHECK: ld4d { z31.d, z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv8f64( %res, i32 1); + ret %v2 +} + +define @ld4d_f64_2( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64_2: +; CHECK: ld4d { z30.d, z31.d, z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv8f64( %res, i32 2); + ret %v3 +} + +define @ld4d_f64_3( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64_3: +; CHECK: ld4d { z29.d, z30.d, z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv8f64( %res, i32 3); + ret %v4 +} + +declare @llvm.aarch64.sve.tuple.get.nxv32i8(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv16i16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8i32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv4i64(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv16f16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8f32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv4f64(, i32) + +declare @llvm.aarch64.sve.tuple.get.nxv48i8(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv24i16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv12i32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv6i64(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv24f16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv12f32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv6f64(, i32) + +declare @llvm.aarch64.sve.tuple.get.nxv64i8(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv32i16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv16i32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8i64(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv32f16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv16f32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8f64(, i32) + +declare @llvm.aarch64.sve.ld2.nxv32i8(, *) +declare @llvm.aarch64.sve.ld2.nxv16i16(, *) +declare @llvm.aarch64.sve.ld2.nxv8i32(, *) +declare @llvm.aarch64.sve.ld2.nxv4i64(, *) +declare @llvm.aarch64.sve.ld2.nxv16f16(, *) +declare @llvm.aarch64.sve.ld2.nxv8f32(, *) +declare @llvm.aarch64.sve.ld2.nxv4f64(, *) + +declare @llvm.aarch64.sve.ld3.nxv48i8(, *) +declare @llvm.aarch64.sve.ld3.nxv24i16(, *) +declare @llvm.aarch64.sve.ld3.nxv12i32(, *) +declare @llvm.aarch64.sve.ld3.nxv6i64(, *) +declare @llvm.aarch64.sve.ld3.nxv24f16(, *) +declare @llvm.aarch64.sve.ld3.nxv12f32(, *) +declare @llvm.aarch64.sve.ld3.nxv6f64(, *) + +declare @llvm.aarch64.sve.ld4.nxv64i8(, *) +declare @llvm.aarch64.sve.ld4.nxv32i16(, *) +declare @llvm.aarch64.sve.ld4.nxv16i32(, *) +declare @llvm.aarch64.sve.ld4.nxv8i64(, *) +declare 
@llvm.aarch64.sve.ld4.nxv32f16(, *) +declare @llvm.aarch64.sve.ld4.nxv16f32(, *) +declare @llvm.aarch64.sve.ld4.nxv8f64(, *) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s ; ; LDNT1B @@ -79,6 +79,243 @@ ret %res } +; +; LD2B +; + +define @ld2b_i8( %pred, * %addr) { +; CHECK-LABEL: ld2b_i8: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv32i8( %pred, + * %addr) + ret %res +} + +; +; LD2H +; + +define @ld2h_i16( %pred, * %addr) { +; CHECK-LABEL: ld2h_i16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv16i16( %pred, + * %addr) + ret %res +} + +define @ld2h_f16( %pred, * %addr) { +; CHECK-LABEL: ld2h_f16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv16f16( %pred, + * %addr) + ret %res +} + +; +; LD2W +; + +define @ld2w_i32( %pred, * %addr) { +; CHECK-LABEL: ld2w_i32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8i32( %pred, + * %addr) + ret %res +} + +define @ld2w_f32( %pred, * %addr) { +; CHECK-LABEL: ld2w_f32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8f32( %pred, + * %addr) + ret %res +} + +; +; LD2D +; + +define @ld2d_i64( %pred, * %addr) { +; CHECK-LABEL: ld2d_i64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4i64( %pred, + * %addr) + ret %res +} + +define @ld2d_f64( %pred, * %addr) { +; CHECK-LABEL: ld2d_f64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4f64( %pred, + * %addr) + ret %res +} + +; +; LD3B +; + +define @ld3b_i8( %pred, * %addr) { +; CHECK-LABEL: ld3b_i8: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv48i8( %pred, + * %addr) + ret %res +} + +; +; LD3H +; + +define @ld3h_i16( %pred, * %addr) { +; CHECK-LABEL: ld3h_i16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24i16( %pred, + * %addr) + ret %res +} + +define @ld3h_f16( %pred, * %addr) { +; CHECK-LABEL: ld3h_f16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24f16( %pred, + * %addr) + ret %res +} + +; +; LD3W +; + +define @ld3w_i32( %pred, * %addr) { +; CHECK-LABEL: ld3w_i32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12i32( %pred, + * %addr) + ret %res +} + +define @ld3w_f32( %pred, * %addr) { +; CHECK-LABEL: ld3w_f32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12f32( %pred, + * %addr) + ret %res +} + +; +; LD3D +; + +define @ld3d_i64( %pred, * %addr) { +; CHECK-LABEL: ld3d_i64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6i64( %pred, + * %addr) + ret %res +} + +define @ld3d_f64( %pred, * %addr) { +; CHECK-LABEL: ld3d_f64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6f64( 
%pred, + * %addr) + ret %res +} + +; +; LD4B +; + +define @ld4b_i8( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + ret %res +} + +; +; LD4H +; + +define @ld4h_i16( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + ret %res +} + +define @ld4h_f16( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + ret %res +} + +; +; LD4W +; + +define @ld4w_i32( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + ret %res +} + +define @ld4w_f32( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + ret %res +} + +; +; LD4D +; + +define @ld4d_i64( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + ret %res +} + +define @ld4d_f64( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + ret %res +} + declare @llvm.aarch64.sve.ldnt1.nxv16i8(, *) declare @llvm.aarch64.sve.ldnt1.nxv8i16(, *) declare @llvm.aarch64.sve.ldnt1.nxv4i32(, *) @@ -86,3 +323,27 @@ declare @llvm.aarch64.sve.ldnt1.nxv8f16(, *) declare @llvm.aarch64.sve.ldnt1.nxv4f32(, *) declare @llvm.aarch64.sve.ldnt1.nxv2f64(, *) + +declare @llvm.aarch64.sve.ld2.nxv32i8(, *) +declare @llvm.aarch64.sve.ld2.nxv16i16(, *) +declare @llvm.aarch64.sve.ld2.nxv8i32(, *) +declare @llvm.aarch64.sve.ld2.nxv4i64(, *) +declare @llvm.aarch64.sve.ld2.nxv16f16(, *) +declare @llvm.aarch64.sve.ld2.nxv8f32(, *) +declare @llvm.aarch64.sve.ld2.nxv4f64(, *) + +declare @llvm.aarch64.sve.ld3.nxv48i8(, *) +declare @llvm.aarch64.sve.ld3.nxv24i16(, *) +declare @llvm.aarch64.sve.ld3.nxv12i32(, *) +declare @llvm.aarch64.sve.ld3.nxv6i64(, *) +declare @llvm.aarch64.sve.ld3.nxv24f16(, *) +declare @llvm.aarch64.sve.ld3.nxv12f32(, *) +declare @llvm.aarch64.sve.ld3.nxv6f64(, *) + +declare @llvm.aarch64.sve.ld4.nxv64i8(, *) +declare @llvm.aarch64.sve.ld4.nxv32i16(, *) +declare @llvm.aarch64.sve.ld4.nxv16i32(, *) +declare @llvm.aarch64.sve.ld4.nxv8i64(, *) +declare @llvm.aarch64.sve.ld4.nxv32f16(, *) +declare @llvm.aarch64.sve.ld4.nxv16f32(, *) +declare @llvm.aarch64.sve.ld4.nxv8f64(, *)
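
For illustration, the short IR sketch below (not part of the diff) shows how a caller might combine one ld3 with several tuple.get extracts so that a single ld3w serves all three parts, each extract becoming a subregister copy of the same result tuple. The function name and the adds are hypothetical, and the operand types assume the convention used in the tests above (a predicate sized for the part type and a pointer to the wide tuple type).

; Sketch only: one ld3 feeding all three tuple parts (assumed operand types).
define <vscale x 4 x i32> @ld3w_all_parts(<vscale x 4 x i1> %pred, <vscale x 12 x i32>* %addr) {
  ; A single wide structured load; selection should emit one ld3w.
  %tuple = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(<vscale x 4 x i1> %pred,
                                                                   <vscale x 12 x i32>* %addr)
  ; Each extract maps to one zsub0-zsub2 subregister of the ld3w result.
  %p0 = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %tuple, i32 0)
  %p1 = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %tuple, i32 1)
  %p2 = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %tuple, i32 2)
  ; Hypothetical arithmetic just to keep all three parts live.
  %s01 = add <vscale x 4 x i32> %p0, %p1
  %s012 = add <vscale x 4 x i32> %s01, %p2
  ret <vscale x 4 x i32> %s012
}

declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(<vscale x 4 x i1>, <vscale x 12 x i32>*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32>, i32)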