diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -172,6 +172,14 @@
   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
+
+  /// Helper function to select vector load intrinsics like
+  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
+  /// \p Opc is the opcode that the selected instruction should use.
+  /// \p NumVecs is the number of vector destinations for the instruction.
+  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
+  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
+                                 MachineInstr &I);
   bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                       MachineRegisterInfo &MRI);
   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -5050,6 +5058,35 @@
   return IntrinOp->getIntrinsicID();
 }
 
+bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
+                                                           unsigned NumVecs,
+                                                           MachineInstr &I) {
+  assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+  assert(Opc && "Expected an opcode?");
+  assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
+  auto &MRI = *MIB.getMRI();
+  LLT Ty = MRI.getType(I.getOperand(0).getReg());
+  unsigned Size = Ty.getSizeInBits();
+  assert((Size == 64 || Size == 128) &&
+         "Destination must be 64 bits or 128 bits?");
+  unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
+  auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
+  assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
+  auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
+  Load.cloneMemRefs(I);
+  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
+  Register SelectedLoadDst = Load->getOperand(0).getReg();
+  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+    auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
+                   .addReg(SelectedLoadDst, 0, SubReg + Idx);
+    // Emit the subreg copies and immediately select them.
+    // FIXME: We should refactor our copy code into an emitCopy helper and
+    // clean up uses of this pattern elsewhere in the selector.
+    selectCopy(*Vec, TII, MRI, TRI, RBI);
+  }
+  return true;
+}
+
 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     MachineInstr &I, MachineRegisterInfo &MRI) {
   // Find the intrinsic ID.
@@ -5087,10 +5124,7 @@
         .addImm(I.getOperand(1).getImm() | ('U' << 8));
     break;
   case Intrinsic::aarch64_neon_ld2: {
-    Register Dst1 = I.getOperand(0).getReg();
-    Register Dst2 = I.getOperand(1).getReg();
-    Register Ptr = I.getOperand(3).getReg();
-    LLT Ty = MRI.getType(Dst1);
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
     unsigned Opc = 0;
     if (Ty == LLT::fixed_vector(8, S8))
       Opc = AArch64::LD2Twov8b;
@@ -5110,23 +5144,31 @@
       Opc = AArch64::LD1Twov1d;
     else
       llvm_unreachable("Unexpected type for ld2!");
-    unsigned SubReg =
-        Ty.getSizeInBits() == 64 ? AArch64::dsub0 : AArch64::qsub0;
-    // This will be selected as a load into a wide register, which is broken
-    // into two vectors subregister copies.
-    auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
-    Load.cloneMemRefs(I);
-    constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
-    Register SelectedLoadDst = Load->getOperand(0).getReg();
-    // Emit the subreg copies and immediately select them.
-    // FIXME: We should refactor our copy code into an emitCopy helper and
-    // clean up uses of this pattern elsewhere in the selector.
-    auto Vec1 = MIB.buildInstr(TargetOpcode::COPY, {Dst1}, {})
-                    .addReg(SelectedLoadDst, 0, SubReg);
-    auto Vec2 = MIB.buildInstr(AArch64::COPY, {Dst2}, {})
-                    .addReg(SelectedLoadDst, 0, SubReg + 1);
-    selectCopy(*Vec1, TII, MRI, TRI, RBI);
-    selectCopy(*Vec2, TII, MRI, TRI, RBI);
+    selectVectorLoadIntrinsic(Opc, 2, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld4: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD4Fourv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD4Fourv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD4Fourv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD4Fourv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD4Fourv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD4Fourv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD4Fourv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Fourv1d;
+    else
+      llvm_unreachable("Unexpected type for ld4!");
+    selectVectorLoadIntrinsic(Opc, 4, I);
     break;
   }
   case Intrinsic::aarch64_neon_st2: {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-ld4.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-ld4.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-ld4.mir
@@ -0,0 +1,292 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+...
+---
+name: LD4Fourv8b
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD4Fourv8b
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv8b:%[0-9]+]]:dddd = LD4Fourv8b %ptr :: (load (<8 x s64>))
+    ; CHECK: %dst1:fpr64 = COPY [[LD4Fourv8b]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD4Fourv8b]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD4Fourv8b]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD4Fourv8b]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<8 x s8>), %dst2:fpr(<8 x s8>), %dst3:fpr(<8 x s8>), %dst4:fpr(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<8 x s64>))
+    $d0 = COPY %dst1(<8 x s8>)
+    $d1 = COPY %dst2(<8 x s8>)
+    $d2 = COPY %dst3(<8 x s8>)
+    $d3 = COPY %dst4(<8 x s8>)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+...
+---
+name: LD4Fourv16b
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD4Fourv16b
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv16b:%[0-9]+]]:qqqq = LD4Fourv16b %ptr :: (load (<16 x s64>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv16b]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv16b]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv16b]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv16b]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<16 x s8>), %dst2:fpr(<16 x s8>), %dst3:fpr(<16 x s8>), %dst4:fpr(<16 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<16 x s64>))
+    $q0 = COPY %dst1(<16 x s8>)
+    $q1 = COPY %dst2(<16 x s8>)
+    $q2 = COPY %dst3(<16 x s8>)
+    $q3 = COPY %dst4(<16 x s8>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name: LD4Fourv4h
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: LD4Fourv4h
+    ; CHECK: liveins: $x0
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv4h:%[0-9]+]]:dddd = LD4Fourv4h %ptr :: (load (<4 x s64>))
+    ; CHECK: %dst1:fpr64 = COPY [[LD4Fourv4h]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD4Fourv4h]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD4Fourv4h]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD4Fourv4h]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<4 x s16>), %dst2:fpr(<4 x s16>), %dst3:fpr(<4 x s16>), %dst4:fpr(<4 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<4 x s64>))
+    $d0 = COPY %dst1(<4 x s16>)
+    $d1 = COPY %dst2(<4 x s16>)
+    $d2 = COPY %dst3(<4 x s16>)
+    $d3 = COPY %dst4(<4 x s16>)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+...
+---
+name: LD4Fourv8h
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD4Fourv8h
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv8h:%[0-9]+]]:qqqq = LD4Fourv8h %ptr :: (load (<8 x s64>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv8h]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv8h]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv8h]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv8h]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<8 x s16>), %dst2:fpr(<8 x s16>), %dst3:fpr(<8 x s16>), %dst4:fpr(<8 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<8 x s64>))
+    $q0 = COPY %dst1(<8 x s16>)
+    $q1 = COPY %dst2(<8 x s16>)
+    $q2 = COPY %dst3(<8 x s16>)
+    $q3 = COPY %dst4(<8 x s16>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name: LD4Fourv2s
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD4Fourv2s
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv2s:%[0-9]+]]:dddd = LD4Fourv2s %ptr :: (load (<2 x s64>))
+    ; CHECK: %dst1:fpr64 = COPY [[LD4Fourv2s]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD4Fourv2s]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD4Fourv2s]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD4Fourv2s]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<2 x s32>), %dst2:fpr(<2 x s32>), %dst3:fpr(<2 x s32>), %dst4:fpr(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<2 x s64>))
+    $d0 = COPY %dst1(<2 x s32>)
+    $d1 = COPY %dst2(<2 x s32>)
+    $d2 = COPY %dst3(<2 x s32>)
+    $d3 = COPY %dst4(<2 x s32>)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+...
+---
+name: LD4Fourv4s
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD4Fourv4s
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv4s:%[0-9]+]]:qqqq = LD4Fourv4s %ptr :: (load (<4 x s64>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv4s]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv4s]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv4s]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv4s]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<4 x s32>), %dst2:fpr(<4 x s32>), %dst3:fpr(<4 x s32>), %dst4:fpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<4 x s64>))
+    $q0 = COPY %dst1(<4 x s32>)
+    $q1 = COPY %dst2(<4 x s32>)
+    $q2 = COPY %dst3(<4 x s32>)
+    $q3 = COPY %dst4(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name: LD4Fourv2d_v2s64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD4Fourv2d_v2s64
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv2d:%[0-9]+]]:qqqq = LD4Fourv2d %ptr :: (load (<2 x s64>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv2d]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv2d]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv2d]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv2d]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<2 x s64>), %dst2:fpr(<2 x s64>), %dst3:fpr(<2 x s64>), %dst4:fpr(<2 x s64>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<2 x s64>))
+    $q0 = COPY %dst1(<2 x s64>)
+    $q1 = COPY %dst2(<2 x s64>)
+    $q2 = COPY %dst3(<2 x s64>)
+    $q3 = COPY %dst4(<2 x s64>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name: LD4Fourv2d_v2p0
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD4Fourv2d_v2p0
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv2d:%[0-9]+]]:qqqq = LD4Fourv2d %ptr :: (load (<2 x p0>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv2d]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv2d]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv2d]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv2d]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<2 x p0>), %dst2:fpr(<2 x p0>), %dst3:fpr(<2 x p0>), %dst4:fpr(<2 x p0>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<2 x p0>))
+    $q0 = COPY %dst1(<2 x p0>)
+    $q1 = COPY %dst2(<2 x p0>)
+    $q2 = COPY %dst3(<2 x p0>)
+    $q3 = COPY %dst4(<2 x p0>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name: LD1Fourv1d_s64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD1Fourv1d_s64
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD1Fourv1d:%[0-9]+]]:dddd = LD1Fourv1d %ptr :: (load (s64))
+    ; CHECK: %dst1:fpr64 = COPY [[LD1Fourv1d]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD1Fourv1d]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD1Fourv1d]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD1Fourv1d]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(s64), %dst2:fpr(s64), %dst3:fpr(s64), %dst4:fpr(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (s64))
+    $d0 = COPY %dst1(s64)
+    $d1 = COPY %dst2(s64)
+    $d2 = COPY %dst3(s64)
+    $d3 = COPY %dst4(s64)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+...
+---
+name: LD1Fourv1d_p0
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; CHECK-LABEL: name: LD1Fourv1d_p0
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD1Fourv1d:%[0-9]+]]:dddd = LD1Fourv1d %ptr :: (load (p0))
+    ; CHECK: %dst1:fpr64 = COPY [[LD1Fourv1d]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD1Fourv1d]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD1Fourv1d]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD1Fourv1d]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(p0), %dst2:fpr(p0), %dst3:fpr(p0), %dst4:fpr(p0) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (p0))
+    $d0 = COPY %dst1(p0)
+    $d1 = COPY %dst2(p0)
+    $d2 = COPY %dst3(p0)
+    $d3 = COPY %dst4(p0)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
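
Note on the helper: the SubReg + Idx arithmetic in selectVectorLoadIntrinsic's copy loop is valid because AArch64's dsub0..dsub3 (and qsub0..qsub3) subregister indices are defined consecutively. The helper also already accepts NumVecs == 3, though this patch only wires up ld2 and ld4. Purely as an illustrative sketch of a possible follow-up (not part of this change; it assumes the existing AArch64 LD3Threev* and LD1Threev1d opcodes), an ld3 case could reuse the helper in the same shape as the ld4 case above:

  // Hypothetical follow-up, not in this patch: select @llvm.aarch64.neon.ld3
  // by picking the LD3 variant for the destination type and letting
  // selectVectorLoadIntrinsic emit the load plus the dsub0-dsub2/qsub0-qsub2
  // subregister copies.
  case Intrinsic::aarch64_neon_ld3: {
    LLT Ty = MRI.getType(I.getOperand(0).getReg());
    unsigned Opc = 0;
    if (Ty == LLT::fixed_vector(8, S8))
      Opc = AArch64::LD3Threev8b;
    else if (Ty == LLT::fixed_vector(16, S8))
      Opc = AArch64::LD3Threev16b;
    else if (Ty == LLT::fixed_vector(4, S16))
      Opc = AArch64::LD3Threev4h;
    else if (Ty == LLT::fixed_vector(8, S16))
      Opc = AArch64::LD3Threev8h;
    else if (Ty == LLT::fixed_vector(2, S32))
      Opc = AArch64::LD3Threev2s;
    else if (Ty == LLT::fixed_vector(4, S32))
      Opc = AArch64::LD3Threev4s;
    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
      Opc = AArch64::LD3Threev2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::LD1Threev1d;
    else
      llvm_unreachable("Unexpected type for ld3!");
    selectVectorLoadIntrinsic(Opc, 3, I);
    break;
  }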