diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -971,10 +971,6 @@
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
                             [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>]>;
-  class AdvSIMD_ManyVec_PredLoad_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMPointerToElt<0>],
-                            [IntrReadMem, IntrArgMemOnly]>;
-
   class AdvSIMD_1Vec_PredLoad_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1569,10 +1565,6 @@
 def int_aarch64_sve_ld1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
-def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
-def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
-def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
-
 def int_aarch64_sve_ld2_sret : AdvSIMD_2Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ld3_sret : AdvSIMD_3Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ld4_sret : AdvSIMD_4Vec_PredLoad_Intrinsic;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -558,6 +558,22 @@
                                         F->arg_begin()->getType());
       return true;
     }
+    static const Regex ldRegex("^aarch64\\.sve\\.ld[234]\\.nxv[a-z0-9]+");
+    if (ldRegex.match(Name)) {
+      Type *ScalarTy =
+          dyn_cast<VectorType>(F->getReturnType())->getElementType();
+      ElementCount EC =
+          dyn_cast<VectorType>(F->arg_begin()->getType())->getElementCount();
+      Type *Ty = VectorType::get(ScalarTy, EC);
+      Intrinsic::ID ID =
+          StringSwitch<Intrinsic::ID>(Name)
+              .StartsWith("aarch64.sve.ld2", Intrinsic::aarch64_sve_ld2_sret)
+              .StartsWith("aarch64.sve.ld3", Intrinsic::aarch64_sve_ld3_sret)
+              .StartsWith("aarch64.sve.ld4", Intrinsic::aarch64_sve_ld4_sret)
+              .Default(Intrinsic::not_intrinsic);
+      NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Ty);
+      return true;
+    }
     if (Name.startswith("arm.neon.vclz")) {
       Type* args[2] = {
           F->arg_begin()->getType(),
@@ -3858,7 +3874,30 @@
     NewCall = Builder.CreateCall(NewFn, Args);
     break;
   }
-
+  case Intrinsic::aarch64_sve_ld3_sret:
+  case Intrinsic::aarch64_sve_ld4_sret:
+  case Intrinsic::aarch64_sve_ld2_sret: {
+    StringRef Name = F->getName();
+    Name = Name.substr(5);
+    unsigned N = StringSwitch<unsigned>(Name)
+                     .StartsWith("aarch64.sve.ld2", 2)
+                     .StartsWith("aarch64.sve.ld3", 3)
+                     .StartsWith("aarch64.sve.ld4", 4)
+                     .Default(Intrinsic::not_intrinsic);
+    ScalableVectorType *RetTy =
+        dyn_cast<ScalableVectorType>(F->getReturnType());
+    unsigned MinElts = RetTy->getMinNumElements() / N;
+    SmallVector<Value *> Args(CI->args());
+    Value *NewLdCall = Builder.CreateCall(NewFn, Args);
+    Value *Ret = llvm::PoisonValue::get(RetTy);
+    for (unsigned I = 0; I < N; I++) {
+      Value *Idx = ConstantInt::get(Type::getInt64Ty(C), I * MinElts);
+      Value *SRet = Builder.CreateExtractValue(NewLdCall, I);
+      Ret = Builder.CreateInsertVector(RetTy, Ret, SRet, Idx);
+    }
+    NewCall = dyn_cast<CallInst>(Ret);
+    break;
+  }
   case Intrinsic::arm_neon_bfdot:
   case Intrinsic::arm_neon_bfmmla:
   case Intrinsic::arm_neon_bfmlalb:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1026,8 +1026,6 @@
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
                                          SDValue &Size,
                                          SelectionDAG &DAG) const;
-  SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
-                             EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
   SDValue
  LowerFixedLengthVectorIntDivideToSVE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13486,61 +13486,6 @@
   return true;
 }
 
-// Lower an SVE structured load intrinsic returning a tuple type to target
-// specific intrinsic taking the same input but returning a multi-result value
-// of the split tuple type.
-//
-// E.g. Lowering an LD3:
-//
-//  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
-//                               <vscale x 4 x i1> %pred,
-//                               <vscale x 4 x i32>* %addr)
-//
-// Output DAG:
-//
-//   t0: ch = EntryToken
-//     t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
-//     t4: i64,ch = CopyFromReg t0, Register:i64 %1
-//   t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
-//   t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
-//
-// This is called pre-legalization to avoid widening/splitting issues with
-// non-power-of-2 tuple types used for LD3, such as nxv12i32.
-SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
-                                                  ArrayRef<SDValue> LoadOps,
-                                                  EVT VT, SelectionDAG &DAG,
-                                                  const SDLoc &DL) const {
-  assert(VT.isScalableVector() && "Can only lower scalable vectors");
-
-  unsigned N, Opcode;
-  static const std::pair<unsigned, std::pair<unsigned, unsigned>>
-      IntrinsicMap[] = {
-          {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
-          {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
-          {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
-
-  std::tie(N, Opcode) = llvm::find_if(IntrinsicMap, [&](auto P) {
-                          return P.first == Intrinsic;
-                        })->second;
-  assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
-         "invalid tuple vector type!");
-
-  EVT SplitVT =
-      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
-                       VT.getVectorElementCount().divideCoefficientBy(N));
-  assert(isTypeLegal(SplitVT));
-
-  SmallVector<EVT, 5> VTs(N, SplitVT);
-  VTs.push_back(MVT::Other); // Chain
-  SDVTList NodeTys = DAG.getVTList(VTs);
-
-  SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
-  SmallVector<SDValue, 4> PseudoLoadOps;
-  for (unsigned I = 0; I < N; ++I)
-    PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
-}
-
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
@@ -20046,20 +19991,6 @@
     SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
     return DAG.getMergeValues({Concat, Chain}, DL);
   }
-  case Intrinsic::aarch64_sve_ld2:
-  case Intrinsic::aarch64_sve_ld3:
-  case Intrinsic::aarch64_sve_ld4: {
-    SDLoc DL(N);
-    SDValue Chain = N->getOperand(0);
-    SDValue Mask = N->getOperand(2);
-    SDValue BasePtr = N->getOperand(3);
-    SDValue LoadOps[] = {Chain, Mask, BasePtr};
-    unsigned IntrinsicID =
-        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-    SDValue Result =
-        LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
-    return DAG.getMergeValues({Result, Chain}, DL);
-  }
   case Intrinsic::aarch64_rndr:
   case Intrinsic::aarch64_rndrrs: {
     unsigned IntrinsicID =
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -29,10 +29,26 @@
 entry:
   %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %1 = call <vscale x 2 x i1>
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( %0) - %2 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x0) - %3 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x1) + %2 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x0) + %3 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x1) %4 = call @llvm.aarch64.sve.ld1.nxv2f64( %1, double* %x2) - %call = call float @callee1(float 1.000000e+00, %2, %3, %4) + %5 = extractvalue { , , , } %2, 0 + %6 = extractvalue { , , , } %2, 1 + %7 = extractvalue { , , , } %2, 2 + %8 = extractvalue { , , , } %2, 3 + %9 = call @llvm.vector.insert.nxv8f64.nx2f64( poison, %5, i64 0) + %10 = call @llvm.vector.insert.nxv8f64.nx2f64( %9, %6, i64 2) + %11 = call @llvm.vector.insert.nxv8f64.nx2f64( %10, %7, i64 4) + %12 = call @llvm.vector.insert.nxv8f64.nx2f64( %11, %8, i64 6) + %13 = extractvalue { , , , } %3, 0 + %14 = extractvalue { , , , } %3, 1 + %15 = extractvalue { , , , } %3, 2 + %16 = extractvalue { , , , } %3, 3 + %17 = call @llvm.vector.insert.nxv8f64.nx2f64( undef, %13, i64 0) + %18 = call @llvm.vector.insert.nxv8f64.nx2f64( %17, %14, i64 2) + %19 = call @llvm.vector.insert.nxv8f64.nx2f64( %18, %15, i64 4) + %20 = call @llvm.vector.insert.nxv8f64.nx2f64( %19, %16, i64 6) + %call = call float @callee1(float 1.000000e+00, %12, %20, %4) ret float %call } @@ -73,9 +89,25 @@ entry: %0 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) %1 = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( %0) - %2 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x0) - %3 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x1) - %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, %2, %3) + %2 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x0) + %3 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x1) + %4 = extractvalue { , , , } %2, 0 + %5 = extractvalue { , , , } %2, 1 + %6 = extractvalue { , , , } %2, 2 + %7 = extractvalue { , , , } %2, 3 + %8 = call @llvm.vector.insert.nxv8f64.nx2f64( poison, %4, i64 0) + %9 = call @llvm.vector.insert.nxv8f64.nx2f64( %8, %5, i64 2) + %10 = call @llvm.vector.insert.nxv8f64.nx2f64( %9, %6, i64 4) + %11 = call @llvm.vector.insert.nxv8f64.nx2f64( %10, %7, i64 6) + %12 = extractvalue { , , , } %3, 0 + %13 = extractvalue { , , , } %3, 1 + %14 = extractvalue { , , , } %3, 2 + %15 = extractvalue { , , , } %3, 3 + %16 = call @llvm.vector.insert.nxv8f64.nx2f64( poison, %12, i64 0) + %17 = call @llvm.vector.insert.nxv8f64.nx2f64( %16, %13, i64 2) + %18 = call @llvm.vector.insert.nxv8f64.nx2f64( %17, %14, i64 4) + %19 = call @llvm.vector.insert.nxv8f64.nx2f64( %18, %15, i64 6) + %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, %11, %19) ret float %call } @@ -102,10 +134,24 @@ entry: %0 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) %1 = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( %0) - %2 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x0) - %3 = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1( %1, double* %x1) + %2 = call {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( %1, double* %x0) + %3 = call {, , } @llvm.aarch64.sve.ld3.sret.nxv2f64( %1, double* %x1) %4 = call @llvm.aarch64.sve.ld1.nxv2f64( %1, double* %x2) - %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, %2, %3, %4) + %5 = extractvalue { , , , } %2, 0 + %6 = extractvalue { , , , } %2, 1 + %7 = extractvalue { , , , } %2, 2 + %8 = extractvalue { , , , } %2, 3 + %9 = call 
@llvm.vector.insert.nxv8f64.nx2f64( poison, %5, i64 0) + %10 = call @llvm.vector.insert.nxv8f64.nx2f64( %9, %6, i64 2) + %11 = call @llvm.vector.insert.nxv8f64.nx2f64( %10, %7, i64 4) + %12 = call @llvm.vector.insert.nxv8f64.nx2f64( %11, %8, i64 6) + %13 = extractvalue { , , } %3, 0 + %14 = extractvalue { , , } %3, 1 + %15 = extractvalue { , , } %3, 2 + %16 = call @llvm.vector.insert.nxv6f64.nx2f64( poison, %13, i64 0) + %17 = call @llvm.vector.insert.nxv6f64.nx2f64( %16 , %14, i64 2) + %18 = call @llvm.vector.insert.nxv6f64.nx2f64( %17 , %15, i64 4) + %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, %12, %18, %4) ret float %call } @@ -381,9 +427,9 @@ declare @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg) declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() -declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(, double*) -declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(, double*) +declare {, , , } @llvm.aarch64.sve.ld4.sret.nxv2f64(, double*) +declare {, , } @llvm.aarch64.sve.ld3.sret.nxv2f64(, double*) declare @llvm.aarch64.sve.ld1.nxv2f64(, double*) declare double @llvm.aarch64.sve.faddv.nxv2f64(, ) -declare @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(, i32 immarg) -declare @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(, i32 immarg) +declare @llvm.vector.insert.nxv8f64.nx2f64(, , i64) +declare @llvm.vector.insert.nxv6f64.nx2f64(, , i64) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll @@ -12,7 +12,9 @@ define @ld2.nxv32i8( %Pg, *%addr) { ; CHECK-LABEL: ld2.nxv32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 2 %base_ptr = bitcast * %base to i8* @@ -23,7 +25,9 @@ define @ld2.nxv32i8_lower_bound( %Pg, *%addr) { ; CHECK-LABEL: ld2.nxv32i8_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: rdvl x8, #-16 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -16 %base_ptr = bitcast * %base to i8 * @@ -34,7 +38,9 @@ define @ld2.nxv32i8_upper_bound( %Pg, *%addr) { ; CHECK-LABEL: ld2.nxv32i8_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: rdvl x8, #14 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 14 %base_ptr = bitcast * %base to i8 * @@ -47,6 +53,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #3 ; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 3 %base_ptr = bitcast * %base to i8 * @@ -59,6 +66,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #-18 ; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -18 %base_ptr = bitcast * %base to i8 * @@ -71,6 +79,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #16 ; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 16 %base_ptr = bitcast * %base to i8 * @@ -82,7 +91,9 @@ define @ld2.nxv16i16( %Pg, 
* %addr) { ; CHECK-LABEL: ld2.nxv16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: addvl x8, x0, #14 +; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 14 %base_ptr = bitcast * %base to i16 * @@ -93,7 +104,9 @@ define @ld2.nxv16f16( %Pg, * %addr) { ; CHECK-LABEL: ld2.nxv16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: addvl x8, x0, #-16 +; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -16 %base_ptr = bitcast * %base to half * @@ -104,7 +117,9 @@ define @ld2.nxv16bf16( %Pg, * %addr) #0 { ; CHECK-LABEL: ld2.nxv16bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, #12, mul vl] +; CHECK-NEXT: addvl x8, x0, #12 +; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 12 %base_ptr = bitcast * %base to bfloat * @@ -116,7 +131,9 @@ define @ld2.nxv8i32( %Pg, * %addr) { ; CHECK-LABEL: ld2.nxv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: addvl x8, x0, #14 +; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 14 %base_ptr = bitcast * %base to i32 * @@ -127,7 +144,9 @@ define @ld2.nxv8f32( %Pg, * %addr) { ; CHECK-LABEL: ld2.nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: addvl x8, x0, #-16 +; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -16 %base_ptr = bitcast * %base to float * @@ -139,7 +158,9 @@ define @ld2.nxv4i64( %Pg, * %addr) { ; CHECK-LABEL: ld2.nxv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: addvl x8, x0, #14 +; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 14 %base_ptr = bitcast * %base to i64 * @@ -150,7 +171,9 @@ define @ld2.nxv4f64( %Pg, * %addr) { ; CHECK-LABEL: ld2.nxv4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: addvl x8, x0, #-16 +; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x8] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -16 %base_ptr = bitcast * %base to double * @@ -162,7 +185,10 @@ define @ld3.nxv48i8( %Pg, *%addr) { ; CHECK-LABEL: ld3.nxv48i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: rdvl x8, #3 +; CHECK-NEXT: ld3b { z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 3 %base_ptr = bitcast * %base to i8 * @@ -173,7 +199,10 @@ define @ld3.nxv48i8_lower_bound( %Pg, *%addr) { ; CHECK-LABEL: ld3.nxv48i8_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: rdvl x8, #-24 +; CHECK-NEXT: ld3b { z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -24 %base_ptr = bitcast * %base to i8 * @@ -184,7 +213,10 @@ define @ld3.nxv48i8_upper_bound( %Pg, *%addr) { ; CHECK-LABEL: ld3.nxv48i8_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3b 
{ z0.b, z1.b, z2.b }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: rdvl x8, #21 +; CHECK-NEXT: ld3b { z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 21 %base_ptr = bitcast * %base to i8 * @@ -196,7 +228,9 @@ ; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_01: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #4 -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld3b { z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 4 %base_ptr = bitcast * %base to i8 * @@ -208,7 +242,9 @@ ; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_02: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #5 -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld3b { z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 5 %base_ptr = bitcast * %base to i8 * @@ -220,7 +256,9 @@ ; CHECK-LABEL: ld3.nxv48i8_outside_lower_bound: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #-27 -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld3b { z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -27 %base_ptr = bitcast * %base to i8 * @@ -232,7 +270,9 @@ ; CHECK-LABEL: ld3.nxv48i8_outside_upper_bound: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #24 -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld3b { z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 24 %base_ptr = bitcast * %base to i8 * @@ -244,7 +284,10 @@ define @ld3.nxv24i16( %Pg, *%addr) { ; CHECK-LABEL: ld3.nxv24i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: addvl x8, x0, #21 +; CHECK-NEXT: ld3h { z1.h, z2.h, z3.h }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 21 %base_ptr = bitcast * %base to i16 * @@ -255,7 +298,10 @@ define @ld3.nxv24f16( %Pg, *%addr) { ; CHECK-LABEL: ld3.nxv24f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: addvl x8, x0, #21 +; CHECK-NEXT: ld3h { z1.h, z2.h, z3.h }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 21 %base_ptr = bitcast * %base to half * @@ -266,7 +312,10 @@ define @ld3.nxv24bf16( %Pg, *%addr) #0 { ; CHECK-LABEL: ld3.nxv24bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: addvl x8, x0, #-24 +; CHECK-NEXT: ld3h { z1.h, z2.h, z3.h }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -24 %base_ptr = bitcast * %base to bfloat * @@ -278,7 +327,10 @@ define @ld3.nxv12i32( %Pg, *%addr) { ; CHECK-LABEL: ld3.nxv12i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: addvl x8, x0, #21 +; CHECK-NEXT: ld3w { z1.s, z2.s, z3.s }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 21 %base_ptr = bitcast * %base to i32 * @@ -289,7 +341,10 @@ define @ld3.nxv12f32( %Pg, *%addr) { ; CHECK-LABEL: 
ld3.nxv12f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: addvl x8, x0, #-24 +; CHECK-NEXT: ld3w { z1.s, z2.s, z3.s }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -24 %base_ptr = bitcast * %base to float * @@ -301,7 +356,10 @@ define @ld3.nxv6i64( %Pg, *%addr) { ; CHECK-LABEL: ld3.nxv6i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: addvl x8, x0, #21 +; CHECK-NEXT: ld3d { z1.d, z2.d, z3.d }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 21 %base_ptr = bitcast * %base to i64 * @@ -312,7 +370,10 @@ define @ld3.nxv6f64( %Pg, *%addr) { ; CHECK-LABEL: ld3.nxv6f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: addvl x8, x0, #-24 +; CHECK-NEXT: ld3d { z1.d, z2.d, z3.d }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -24 %base_ptr = bitcast * %base to double * @@ -324,7 +385,11 @@ define @ld4.nxv64i8( %Pg, *%addr) { ; CHECK-LABEL: ld4.nxv64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: rdvl x8, #4 +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 4 %base_ptr = bitcast * %base to i8 * @@ -335,7 +400,11 @@ define @ld4.nxv64i8_lower_bound( %Pg, *%addr) { ; CHECK-LABEL: ld4.nxv64i8_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: rdvl x8, #-32 +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -32 %base_ptr = bitcast * %base to i8 * @@ -346,7 +415,11 @@ define @ld4.nxv64i8_upper_bound( %Pg, *%addr) { ; CHECK-LABEL: ld4.nxv64i8_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: rdvl x8, #28 +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 28 %base_ptr = bitcast * %base to i8 * @@ -358,7 +431,10 @@ ; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_01: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #5 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 5 %base_ptr = bitcast * %base to i8 * @@ -370,7 +446,10 @@ ; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_02: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #6 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 6 %base_ptr = bitcast * %base to i8 * @@ -382,7 +461,10 @@ ; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_03: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #7 -; CHECK-NEXT: 
ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 7 %base_ptr = bitcast * %base to i8 * @@ -397,7 +479,10 @@ ; CHECK-NEXT: mov x9, #-576 ; CHECK-NEXT: lsr x8, x8, #4 ; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret ; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #4) #9) ; xM = -9 * 2^6 @@ -416,7 +501,10 @@ ; CHECK-NEXT: mov w9, #512 ; CHECK-NEXT: lsr x8, x8, #4 ; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret ; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #16) #2) ; xM = 2^9 @@ -432,7 +520,11 @@ define @ld4.nxv32i16( %Pg, *%addr) { ; CHECK-LABEL: ld4.nxv32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #8, mul vl] +; CHECK-NEXT: addvl x8, x0, #8 +; CHECK-NEXT: ld4h { z2.h, z3.h, z4.h, z5.h }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 8 %base_ptr = bitcast * %base to i16 * @@ -443,7 +535,11 @@ define @ld4.nxv32f16( %Pg, *%addr) { ; CHECK-LABEL: ld4.nxv32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: addvl x8, x0, #28 +; CHECK-NEXT: ld4h { z2.h, z3.h, z4.h, z5.h }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 28 %base_ptr = bitcast * %base to half * @@ -454,7 +550,11 @@ define @ld4.nxv32bf16( %Pg, *%addr) #0 { ; CHECK-LABEL: ld4.nxv32bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: addvl x8, x0, #-32 +; CHECK-NEXT: ld4h { z2.h, z3.h, z4.h, z5.h }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -32 %base_ptr = bitcast * %base to bfloat * @@ -466,7 +566,11 @@ define @ld4.nxv16i32( %Pg, *%addr) { ; CHECK-LABEL: ld4.nxv16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: addvl x8, x0, #28 +; CHECK-NEXT: ld4w { z2.s, z3.s, z4.s, z5.s }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 28 %base_ptr = bitcast * %base to i32 * @@ -477,7 +581,11 @@ define @ld4.nxv16f32( %Pg, * %addr) { ; CHECK-LABEL: ld4.nxv16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: addvl x8, x0, #-32 +; CHECK-NEXT: ld4w { z2.s, z3.s, z4.s, z5.s }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -32 %base_ptr = bitcast * %base to float * @@ -489,7 +597,11 @@ define @ld4.nxv8i64( %Pg, *%addr) { ; CHECK-LABEL: ld4.nxv8i64: ; CHECK: // %bb.0: -; 
CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: addvl x8, x0, #28 +; CHECK-NEXT: ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 28 %base_ptr = bitcast * %base to i64 * @@ -500,7 +612,11 @@ define @ld4.nxv8f64( %Pg, *%addr) { ; CHECK-LABEL: ld4.nxv8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: addvl x8, x0, #-32 +; CHECK-NEXT: ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x8] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %base = getelementptr , * %addr, i64 -32 %base_ptr = bitcast * %base to double * diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll @@ -7,6 +7,7 @@ ; CHECK-LABEL: ld2.nxv32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x1] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %addr2 = getelementptr i8, i8 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%addr2) @@ -18,6 +19,7 @@ ; CHECK-LABEL: ld2.nxv16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %addr2 = getelementptr i16, i16 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%addr2) @@ -28,6 +30,7 @@ ; CHECK-LABEL: ld2.nxv16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %addr2 = getelementptr half, half * %addr, i64 %a %res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%addr2) @@ -38,6 +41,7 @@ ; CHECK-LABEL: ld2.nxv16bf16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %addr2 = getelementptr bfloat, bfloat * %addr, i64 %a %res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) @@ -49,6 +53,7 @@ ; CHECK-LABEL: ld2.nxv8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %addr2 = getelementptr i32, i32 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%addr2) @@ -59,6 +64,7 @@ ; CHECK-LABEL: ld2.nxv8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %addr2 = getelementptr float, float * %addr, i64 %a %res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%addr2) @@ -70,6 +76,7 @@ ; CHECK-LABEL: ld2.nxv4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %addr2 = getelementptr i64, i64 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%addr2) @@ -80,6 +87,7 @@ ; CHECK-LABEL: ld2.nxv4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ret %addr2 = getelementptr double, double * %addr, i64 %a %res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%addr2) @@ -90,7 +98,9 @@ define @ld3.nxv48i8( %Pg, i8 *%addr, i64 %a) { ; CHECK-LABEL: 
ld3.nxv48i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x1] +; CHECK-NEXT: ld3b { z1.b, z2.b, z3.b }, p0/z, [x0, x1] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %addr2 = getelementptr i8, i8 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%addr2) @@ -101,7 +111,9 @@ define @ld3.nxv24i16( %Pg, i16 *%addr, i64 %a) { ; CHECK-LABEL: ld3.nxv24i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ld3h { z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %addr2 = getelementptr i16, i16 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%addr2) @@ -111,7 +123,9 @@ define @ld3.nxv24f16( %Pg, half *%addr, i64 %a) { ; CHECK-LABEL: ld3.nxv24f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ld3h { z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %addr2 = getelementptr half, half * %addr, i64 %a %res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%addr2) @@ -121,7 +135,9 @@ define @ld3.nxv24bf16( %Pg, bfloat *%addr, i64 %a) #0 { ; CHECK-LABEL: ld3.nxv24bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ld3h { z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %addr2 = getelementptr bfloat, bfloat * %addr, i64 %a %res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) @@ -132,7 +148,9 @@ define @ld3.nxv12i32( %Pg, i32 *%addr, i64 %a) { ; CHECK-LABEL: ld3.nxv12i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ld3w { z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %addr2 = getelementptr i32, i32 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%addr2) @@ -142,7 +160,9 @@ define @ld3.nxv12f32( %Pg, float *%addr, i64 %a) { ; CHECK-LABEL: ld3.nxv12f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ld3w { z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %addr2 = getelementptr float, float * %addr, i64 %a %res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%addr2) @@ -153,7 +173,9 @@ define @ld3.nxv6i64( %Pg, i64 *%addr, i64 %a) { ; CHECK-LABEL: ld3.nxv6i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ld3d { z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %addr2 = getelementptr i64, i64 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%addr2) @@ -163,7 +185,9 @@ define @ld3.nxv6f64( %Pg, double *%addr, i64 %a) { ; CHECK-LABEL: ld3.nxv6f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ld3d { z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ret %addr2 = getelementptr double, double * %addr, i64 %a %res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%addr2) @@ -174,7 +198,10 @@ define 
@ld4.nxv64i8( %Pg, i8 *%addr, i64 %a) { ; CHECK-LABEL: ld4.nxv64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x1] +; CHECK-NEXT: ld4b { z2.b, z3.b, z4.b, z5.b }, p0/z, [x0, x1] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %addr2 = getelementptr i8, i8 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%addr2) @@ -185,7 +212,10 @@ define @ld4.nxv32i16( %Pg, i16 *%addr, i64 %a) { ; CHECK-LABEL: ld4.nxv32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ld4h { z2.h, z3.h, z4.h, z5.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %addr2 = getelementptr i16, i16 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%addr2) @@ -195,7 +225,10 @@ define @ld4.nxv32f16( %Pg, half *%addr, i64 %a) { ; CHECK-LABEL: ld4.nxv32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ld4h { z2.h, z3.h, z4.h, z5.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %addr2 = getelementptr half, half * %addr, i64 %a %res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%addr2) @@ -205,7 +238,10 @@ define @ld4.nxv32bf16( %Pg, bfloat *%addr, i64 %a) #0 { ; CHECK-LABEL: ld4.nxv32bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ld4h { z2.h, z3.h, z4.h, z5.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %addr2 = getelementptr bfloat, bfloat * %addr, i64 %a %res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) @@ -216,7 +252,10 @@ define @ld4.nxv16i32( %Pg, i32 *%addr, i64 %a) { ; CHECK-LABEL: ld4.nxv16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ld4w { z2.s, z3.s, z4.s, z5.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %addr2 = getelementptr i32, i32 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%addr2) @@ -226,7 +265,10 @@ define @ld4.nxv16f32( %Pg, float *%addr, i64 %a) { ; CHECK-LABEL: ld4.nxv16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ld4w { z2.s, z3.s, z4.s, z5.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %addr2 = getelementptr float, float * %addr, i64 %a %res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %Pg, float *%addr2) @@ -237,7 +279,10 @@ define @ld4.nxv8i64( %Pg, i64 *%addr, i64 %a) { ; CHECK-LABEL: ld4.nxv8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %addr2 = getelementptr i64, i64 * %addr, i64 %a %res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %Pg, i64 *%addr2) @@ -247,7 +292,10 @@ define @ld4.nxv8f64( %Pg, double *%addr, i64 %a) { ; CHECK-LABEL: ld4.nxv8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { 
z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: ret %addr2 = getelementptr double, double * %addr, i64 %a %res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %Pg, double *%addr2) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll @@ -603,270 +603,6 @@ ret %res } -; -; LD2B -; - -define @ld2b_i8( %pred, i8* %addr) { -; CHECK-LABEL: ld2b_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %pred, i8* %addr) - ret %res -} - -; -; LD2H -; - -define @ld2h_i16( %pred, i16* %addr) { -; CHECK-LABEL: ld2h_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %pred, i16* %addr) - ret %res -} - -define @ld2h_f16( %pred, half* %addr) { -; CHECK-LABEL: ld2h_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %pred, half* %addr) - ret %res -} - -define @ld2h_bf16( %pred, bfloat* %addr) { -; CHECK-LABEL: ld2h_bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %pred, bfloat* %addr) - ret %res -} - -; -; LD2W -; - -define @ld2w_i32( %pred, i32* %addr) { -; CHECK-LABEL: ld2w_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %pred, i32* %addr) - ret %res -} - -define @ld2w_f32( %pred, float* %addr) { -; CHECK-LABEL: ld2w_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %pred, float* %addr) - ret %res -} - -; -; LD2D -; - -define @ld2d_i64( %pred, i64* %addr) { -; CHECK-LABEL: ld2d_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %pred, i64* %addr) - ret %res -} - -define @ld2d_f64( %pred, double* %addr) { -; CHECK-LABEL: ld2d_f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %pred, double* %addr) - ret %res -} - -; -; LD3B -; - -define @ld3b_i8( %pred, i8* %addr) { -; CHECK-LABEL: ld3b_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %pred, i8* %addr) - ret %res -} - -; -; LD3H -; - -define @ld3h_i16( %pred, i16* %addr) { -; CHECK-LABEL: ld3h_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %pred, i16* %addr) - ret %res -} - -define @ld3h_f16( %pred, half* %addr) { -; CHECK-LABEL: ld3h_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %pred, half* %addr) - ret %res -} - -define @ld3h_bf16( %pred, bfloat* %addr) { -; CHECK-LABEL: ld3h_bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: 
ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %pred, bfloat* %addr) - ret %res -} - -; -; LD3W -; - -define @ld3w_i32( %pred, i32* %addr) { -; CHECK-LABEL: ld3w_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %pred, i32* %addr) - ret %res -} - -define @ld3w_f32( %pred, float* %addr) { -; CHECK-LABEL: ld3w_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %pred, float* %addr) - ret %res -} - -; -; LD3D -; - -define @ld3d_i64( %pred, i64* %addr) { -; CHECK-LABEL: ld3d_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %pred, i64* %addr) - ret %res -} - -define @ld3d_f64( %pred, double* %addr) { -; CHECK-LABEL: ld3d_f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %pred, double* %addr) - ret %res -} - -; -; LD4B -; - -define @ld4b_i8( %pred, i8* %addr) { -; CHECK-LABEL: ld4b_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %pred, i8* %addr) - ret %res -} - -; -; LD4H -; - -define @ld4h_i16( %pred, i16* %addr) { -; CHECK-LABEL: ld4h_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %pred, i16* %addr) - ret %res -} - -define @ld4h_f16( %pred, half* %addr) { -; CHECK-LABEL: ld4h_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %pred, half* %addr) - ret %res -} - -define @ld4h_bf16( %pred, bfloat* %addr) { -; CHECK-LABEL: ld4h_bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %pred, bfloat* %addr) - ret %res -} - -; -; LD4W -; - -define @ld4w_i32( %pred, i32* %addr) { -; CHECK-LABEL: ld4w_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %pred, i32* %addr) - ret %res -} - -define @ld4w_f32( %pred, float* %addr) { -; CHECK-LABEL: ld4w_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %pred, float* %addr) - ret %res -} - -; -; LD4D -; - -define @ld4d_i64( %pred, i64* %addr) { -; CHECK-LABEL: ld4d_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %pred, i64* %addr) - ret %res -} - -define @ld4d_f64( %pred, double* %addr) { -; CHECK-LABEL: ld4d_f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] -; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %pred, double* %addr) - ret %res -} - declare @llvm.aarch64.sve.ld1rq.nxv16i8(, i8*) declare @llvm.aarch64.sve.ld1rq.nxv8i16(, i16*) diff --git a/llvm/test/CodeGen/AArch64/sve-merging-stores.ll b/llvm/test/CodeGen/AArch64/sve-merging-stores.ll 
--- a/llvm/test/CodeGen/AArch64/sve-merging-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-merging-stores.ll
@@ -3,10 +3,7 @@
 %complex = type { { double, double } }
 
 ; Function Attrs: argmemonly nounwind readonly
-declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64(<vscale x 4 x double>, i32 immarg) #3
-
-; Function Attrs: argmemonly nounwind readonly
-declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1(<vscale x 2 x i1>, double*) #3
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1>, double*) #3
 
 ; Function Attrs: nounwind readnone
 declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>) #2
@@ -20,10 +17,10 @@
 ; CHECK-NEXT:    str q2, [x0]
   %realp = getelementptr inbounds %complex, %complex* %outval, i64 0, i32 0, i32 0
   %imagp = getelementptr inbounds %complex, %complex* %outval, i64 0, i32 0, i32 1
-  %1 = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1(<vscale x 2 x i1> %pred, double* nonnull %inptr)
-  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64(<vscale x 4 x double> %1, i32 0)
+  %1 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> %pred, double* nonnull %inptr)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 0
   %3 = call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %2)
-  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv4f64(<vscale x 4 x double> %1, i32 1)
+  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 1
   %5 = call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %4)
   store double %3, double* %realp, align 8
   store double %5, double* %imagp, align 8