Index: include/llvm/IR/IntrinsicsARM.td =================================================================== --- include/llvm/IR/IntrinsicsARM.td +++ include/llvm/IR/IntrinsicsARM.td @@ -620,6 +620,18 @@ [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; +def int_arm_neon_vld1x2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMAnyPointerType>], + [IntrReadMem, IntrArgMemOnly]>; +def int_arm_neon_vld1x3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>], + [LLVMAnyPointerType>], + [IntrReadMem, IntrArgMemOnly]>; +def int_arm_neon_vld1x4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMAnyPointerType>], + [IntrReadMem, IntrArgMemOnly]>; + // Vector load N-element structure to one lane. // Source operands are: the address, the N input vectors (since only one // lane is assigned), the lane number, and the alignment. Index: lib/Target/ARM/ARMBaseInstrInfo.cpp =================================================================== --- lib/Target/ARM/ARMBaseInstrInfo.cpp +++ lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1341,7 +1341,13 @@ } break; case ARM::VLD1q64: + case ARM::VLD1d8TPseudo: + case ARM::VLD1d16TPseudo: + case ARM::VLD1d32TPseudo: case ARM::VLD1d64TPseudo: + case ARM::VLD1d8QPseudo: + case ARM::VLD1d16QPseudo: + case ARM::VLD1d32QPseudo: case ARM::VLD1d64QPseudo: if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) { FrameIndex = MI.getOperand(1).getIndex(); @@ -4212,6 +4218,9 @@ case ARM::VLD3d8Pseudo: case ARM::VLD3d16Pseudo: case ARM::VLD3d32Pseudo: + case ARM::VLD1d8TPseudo: + case ARM::VLD1d16TPseudo: + case ARM::VLD1d32TPseudo: case ARM::VLD1d64TPseudo: case ARM::VLD1d64TPseudoWB_fixed: case ARM::VLD1d64TPseudoWB_register: @@ -4230,9 +4239,28 @@ case ARM::VLD4d8Pseudo: case ARM::VLD4d16Pseudo: case ARM::VLD4d32Pseudo: + case ARM::VLD1d8QPseudo: + case ARM::VLD1d16QPseudo: + case ARM::VLD1d32QPseudo: case ARM::VLD1d64QPseudo: case ARM::VLD1d64QPseudoWB_fixed: 
case ARM::VLD1d64QPseudoWB_register: + case ARM::VLD1q8HighQPseudo: + case ARM::VLD1q8LowQPseudo_UPD: + case ARM::VLD1q8HighTPseudo: + case ARM::VLD1q8LowTPseudo_UPD: + case ARM::VLD1q16HighQPseudo: + case ARM::VLD1q16LowQPseudo_UPD: + case ARM::VLD1q16HighTPseudo: + case ARM::VLD1q16LowTPseudo_UPD: + case ARM::VLD1q32HighQPseudo: + case ARM::VLD1q32LowQPseudo_UPD: + case ARM::VLD1q32HighTPseudo: + case ARM::VLD1q32LowTPseudo_UPD: + case ARM::VLD1q64HighQPseudo: + case ARM::VLD1q64LowQPseudo_UPD: + case ARM::VLD1q64HighTPseudo: + case ARM::VLD1q64LowTPseudo_UPD: case ARM::VLD4d8Pseudo_UPD: case ARM::VLD4d16Pseudo_UPD: case ARM::VLD4d32Pseudo_UPD: Index: lib/Target/ARM/ARMExpandPseudoInsts.cpp =================================================================== --- lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -110,6 +110,9 @@ // OddDblSpc depending on the lane number operand. enum NEONRegSpacing { SingleSpc, + SingleLowSpc , // Single spacing, low registers, three and four vectors. + SingleHighQSpc, // Single spacing, high registers, four vectors. + SingleHighTSpc, // Single spacing, high registers, three vectors. 
EvenDblSpc, OddDblSpc }; @@ -154,12 +157,34 @@ { ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, false, EvenDblSpc, 1, 8 ,true}, { ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true}, +{ ARM::VLD1d16QPseudo, ARM::VLD1d16Q, true, false, false, SingleSpc, 4, 4 ,false}, +{ ARM::VLD1d16TPseudo, ARM::VLD1d16T, true, false, false, SingleSpc, 3, 4 ,false}, +{ ARM::VLD1d32QPseudo, ARM::VLD1d32Q, true, false, false, SingleSpc, 4, 2 ,false}, +{ ARM::VLD1d32TPseudo, ARM::VLD1d32T, true, false, false, SingleSpc, 3, 2 ,false}, { ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false}, { ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false}, { ARM::VLD1d64QPseudoWB_register, ARM::VLD1d64Qwb_register, true, true, true, SingleSpc, 4, 1 ,false}, { ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false}, { ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false}, { ARM::VLD1d64TPseudoWB_register, ARM::VLD1d64Twb_register, true, true, true, SingleSpc, 3, 1 ,false}, +{ ARM::VLD1d8QPseudo, ARM::VLD1d8Q, true, false, false, SingleSpc, 4, 8 ,false}, +{ ARM::VLD1d8TPseudo, ARM::VLD1d8T, true, false, false, SingleSpc, 3, 8 ,false}, +{ ARM::VLD1q16HighQPseudo, ARM::VLD1d16Q, true, false, false, SingleHighQSpc, 4, 4 ,false}, +{ ARM::VLD1q16HighTPseudo, ARM::VLD1d16T, true, false, false, SingleHighTSpc, 3, 4 ,false}, +{ ARM::VLD1q16LowQPseudo_UPD, ARM::VLD1d16Qwb_fixed, true, true, true, SingleLowSpc, 4, 4 ,false}, +{ ARM::VLD1q16LowTPseudo_UPD, ARM::VLD1d16Twb_fixed, true, true, true, SingleLowSpc, 3, 4 ,false}, +{ ARM::VLD1q32HighQPseudo, ARM::VLD1d32Q, true, false, false, SingleHighQSpc, 4, 2 ,false}, +{ ARM::VLD1q32HighTPseudo, ARM::VLD1d32T, true, false, false, SingleHighTSpc, 3, 2 ,false}, +{ ARM::VLD1q32LowQPseudo_UPD, ARM::VLD1d32Qwb_fixed, true, true, true, SingleLowSpc, 4, 2 ,false}, +{ ARM::VLD1q32LowTPseudo_UPD, 
ARM::VLD1d32Twb_fixed, true, true, true, SingleLowSpc, 3, 2 ,false}, +{ ARM::VLD1q64HighQPseudo, ARM::VLD1d64Q, true, false, false, SingleHighQSpc, 4, 1 ,false}, +{ ARM::VLD1q64HighTPseudo, ARM::VLD1d64T, true, false, false, SingleHighTSpc, 3, 1 ,false}, +{ ARM::VLD1q64LowQPseudo_UPD, ARM::VLD1d64Qwb_fixed, true, true, true, SingleLowSpc, 4, 1 ,false}, +{ ARM::VLD1q64LowTPseudo_UPD, ARM::VLD1d64Twb_fixed, true, true, true, SingleLowSpc, 3, 1 ,false}, +{ ARM::VLD1q8HighQPseudo, ARM::VLD1d8Q, true, false, false, SingleHighQSpc, 4, 8 ,false}, +{ ARM::VLD1q8HighTPseudo, ARM::VLD1d8T, true, false, false, SingleHighTSpc, 3, 8 ,false}, +{ ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false}, +{ ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false}, { ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true}, @@ -370,11 +395,21 @@ static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc, const TargetRegisterInfo *TRI, unsigned &D0, unsigned &D1, unsigned &D2, unsigned &D3) { - if (RegSpc == SingleSpc) { + if (RegSpc == SingleSpc || RegSpc == SingleLowSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_0); D1 = TRI->getSubReg(Reg, ARM::dsub_1); D2 = TRI->getSubReg(Reg, ARM::dsub_2); D3 = TRI->getSubReg(Reg, ARM::dsub_3); + } else if (RegSpc == SingleHighQSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_4); + D1 = TRI->getSubReg(Reg, ARM::dsub_5); + D2 = TRI->getSubReg(Reg, ARM::dsub_6); + D3 = TRI->getSubReg(Reg, ARM::dsub_7); + } else if (RegSpc == SingleHighTSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_3); + D1 = TRI->getSubReg(Reg, ARM::dsub_4); + D2 = TRI->getSubReg(Reg, ARM::dsub_5); + D3 = TRI->getSubReg(Reg, ARM::dsub_6); } else if (RegSpc == EvenDblSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_0); D1 = TRI->getSubReg(Reg, ARM::dsub_2); @@ -422,15 +457,40 @@ // Copy the addrmode6 operands. 
MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. - if (TableEntry->hasWritebackOperand) - MIB.add(MI.getOperand(OpIdx++)); + if (TableEntry->hasWritebackOperand) { + // TODO: The writing-back pseudo instructions we translate here are all + // defined to take am6offset nodes that are able to represent both fixed + // and register forms. Some real instructions, however, do not rely on + // am6offset and have separate definitions for such forms. When this is the + // case, fixed forms do not take any offset nodes, so here we skip them for + // such instructions. Once all real and pseudo writing-back instructions are + // rewritten without use of am6offset nodes, this code will go away. + const MachineOperand &AM6Offset = MI.getOperand(OpIdx++); + if (TableEntry->RealOpc == ARM::VLD1d8Qwb_fixed || + TableEntry->RealOpc == ARM::VLD1d16Qwb_fixed || + TableEntry->RealOpc == ARM::VLD1d32Qwb_fixed || + TableEntry->RealOpc == ARM::VLD1d64Qwb_fixed || + TableEntry->RealOpc == ARM::VLD1d8Twb_fixed || + TableEntry->RealOpc == ARM::VLD1d16Twb_fixed || + TableEntry->RealOpc == ARM::VLD1d32Twb_fixed || + TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) { + assert(AM6Offset.getReg() == 0 && + "A fixed writing-back pseudo intruction provides an offset " + "register!"); + } else { + MIB.add(AM6Offset); + } + } // For an instruction writing double-spaced subregs, the pseudo instruction // has an extra operand that is a use of the super-register. Record the // operand index and skip over it. unsigned SrcOpIdx = 0; - if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc) + if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc || + RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc || + RegSpc == SingleHighTSpc) SrcOpIdx = OpIdx++; // Copy the predicate operands. 
@@ -1503,6 +1563,9 @@ case ARM::VLD3d8Pseudo: case ARM::VLD3d16Pseudo: case ARM::VLD3d32Pseudo: + case ARM::VLD1d8TPseudo: + case ARM::VLD1d16TPseudo: + case ARM::VLD1d32TPseudo: case ARM::VLD1d64TPseudo: case ARM::VLD1d64TPseudoWB_fixed: case ARM::VLD1d64TPseudoWB_register: @@ -1521,9 +1584,28 @@ case ARM::VLD4d8Pseudo: case ARM::VLD4d16Pseudo: case ARM::VLD4d32Pseudo: + case ARM::VLD1d8QPseudo: + case ARM::VLD1d16QPseudo: + case ARM::VLD1d32QPseudo: case ARM::VLD1d64QPseudo: case ARM::VLD1d64QPseudoWB_fixed: case ARM::VLD1d64QPseudoWB_register: + case ARM::VLD1q8HighQPseudo: + case ARM::VLD1q8LowQPseudo_UPD: + case ARM::VLD1q8HighTPseudo: + case ARM::VLD1q8LowTPseudo_UPD: + case ARM::VLD1q16HighQPseudo: + case ARM::VLD1q16LowQPseudo_UPD: + case ARM::VLD1q16HighTPseudo: + case ARM::VLD1q16LowTPseudo_UPD: + case ARM::VLD1q32HighQPseudo: + case ARM::VLD1q32LowQPseudo_UPD: + case ARM::VLD1q32HighTPseudo: + case ARM::VLD1q32LowTPseudo_UPD: + case ARM::VLD1q64HighQPseudo: + case ARM::VLD1q64LowQPseudo_UPD: + case ARM::VLD1q64HighTPseudo: + case ARM::VLD1q64LowTPseudo_UPD: case ARM::VLD4d8Pseudo_UPD: case ARM::VLD4d16Pseudo_UPD: case ARM::VLD4d32Pseudo_UPD: Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1761,9 +1761,7 @@ case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; case MVT::v2f64: - case MVT::v2i64: OpcodeIndex = 3; - assert(NumVecs == 1 && "v2i64 type only supported for VLD1"); - break; + case MVT::v2i64: OpcodeIndex = 3; break; } EVT ResTy; @@ -3441,6 +3439,51 @@ return; } + case Intrinsic::arm_neon_vld1x2: { + static const uint16_t DOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64 }; + static const uint16_t QOpcodes[] = { ARM::VLD1d8QPseudo, + ARM::VLD1d16QPseudo, + ARM::VLD1d32QPseudo, + ARM::VLD1d64QPseudo }; + SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr); + return; + } + + case 
Intrinsic::arm_neon_vld1x3: { + static const uint16_t DOpcodes[] = { ARM::VLD1d8TPseudo, + ARM::VLD1d16TPseudo, + ARM::VLD1d32TPseudo, + ARM::VLD1d64TPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowTPseudo_UPD, + ARM::VLD1q16LowTPseudo_UPD, + ARM::VLD1q32LowTPseudo_UPD, + ARM::VLD1q64LowTPseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighTPseudo, + ARM::VLD1q16HighTPseudo, + ARM::VLD1q32HighTPseudo, + ARM::VLD1q64HighTPseudo }; + SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + return; + } + + case Intrinsic::arm_neon_vld1x4: { + static const uint16_t DOpcodes[] = { ARM::VLD1d8QPseudo, + ARM::VLD1d16QPseudo, + ARM::VLD1d32QPseudo, + ARM::VLD1d64QPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowQPseudo_UPD, + ARM::VLD1q16LowQPseudo_UPD, + ARM::VLD1q32LowQPseudo_UPD, + ARM::VLD1q64LowQPseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighQPseudo, + ARM::VLD1q16HighQPseudo, + ARM::VLD1q32HighQPseudo, + ARM::VLD1q64HighQPseudo }; + SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; + } + case Intrinsic::arm_neon_vld2: { static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, ARM::VLD2d32, ARM::VLD1q64 }; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -12787,6 +12787,9 @@ case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld1x2: + case Intrinsic::arm_neon_vld1x3: + case Intrinsic::arm_neon_vld1x4: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: case Intrinsic::arm_neon_vld4: @@ -14094,6 +14097,21 @@ Info.flags = MachineMemOperand::MOLoad; return true; } + case Intrinsic::arm_neon_vld1x2: + case Intrinsic::arm_neon_vld1x3: + case Intrinsic::arm_neon_vld1x4: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the 
entire set of vectors loaded. + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + // volatile loads with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOLoad; + return true; + } case Intrinsic::arm_neon_vst1: case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: Index: lib/Target/ARM/ARMInstrNEON.td =================================================================== --- lib/Target/ARM/ARMInstrNEON.td +++ lib/Target/ARM/ARMInstrNEON.td @@ -770,10 +770,22 @@ defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>; +def VLD1d8TPseudo : VLDQQPseudo, Sched<[WriteVLD3]>; +def VLD1d16TPseudo : VLDQQPseudo, Sched<[WriteVLD3]>; +def VLD1d32TPseudo : VLDQQPseudo, Sched<[WriteVLD3]>; def VLD1d64TPseudo : VLDQQPseudo, Sched<[WriteVLD3]>; def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD3]>; def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD3]>; +def VLD1q8HighTPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; +def VLD1q8LowTPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD1q16HighTPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; +def VLD1q16LowTPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD1q32HighTPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; +def VLD1q32LowTPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD1q64HighTPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; +def VLD1q64LowTPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; + // ...with 4 registers class VLD1D4 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd), @@ -811,10 +823,22 @@ defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, 
"64", addrmode6align64or128or256>; +def VLD1d8QPseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD1d16QPseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD1d32QPseudo : VLDQQPseudo, Sched<[WriteVLD4]>; def VLD1d64QPseudo : VLDQQPseudo, Sched<[WriteVLD4]>; def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; +def VLD1q8LowQPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD1q8HighQPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; +def VLD1q16LowQPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD1q16HighQPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; +def VLD1q32LowQPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD1q32HighQPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; +def VLD1q64LowQPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD1q64HighQPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; + // VLD2 : Vector Load (multiple 2-element structures) class VLD2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> Index: test/CodeGen/ARM/arm-vld1.ll =================================================================== --- test/CodeGen/ARM/arm-vld1.ll +++ test/CodeGen/ARM/arm-vld1.ll @@ -0,0 +1,242 @@ +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \ +; RUN: -asm-verbose=false | FileCheck %s + +%struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> } +%struct.uint16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } +%struct.uint16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } + +%struct.uint32x2x2_t = type { <2 x i32>, <2 x i32> } +%struct.uint32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> } +%struct.uint32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } + +%struct.uint64x1x2_t = type { <1 x i64>, <1 x i64> } +%struct.uint64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> } +%struct.uint64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } + +%struct.uint8x8x2_t = type { <8 x i8>, 
<8 x i8> } +%struct.uint8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } +%struct.uint8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } + +%struct.uint16x8x2_t = type { <8 x i16>, <8 x i16> } +%struct.uint16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> } +%struct.uint16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } + +%struct.uint32x4x2_t = type { <4 x i32>, <4 x i32> } +%struct.uint32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> } +%struct.uint32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } + +%struct.uint64x2x2_t = type { <2 x i64>, <2 x i64> } +%struct.uint64x2x3_t = type { <2 x i64>, <2 x i64>, <2 x i64> } +%struct.uint64x2x4_t = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } + +%struct.uint8x16x2_t = type { <16 x i8>, <16 x i8> } +%struct.uint8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> } +%struct.uint8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } + +declare %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0i16(i16*) nounwind readonly +declare %struct.uint16x4x3_t @llvm.arm.neon.vld1x3.v4i16.p0i16(i16*) nounwind readonly +declare %struct.uint16x4x4_t @llvm.arm.neon.vld1x4.v4i16.p0i16(i16*) nounwind readonly + +declare %struct.uint32x2x2_t @llvm.arm.neon.vld1x2.v2i32.p0i32(i32*) nounwind readonly +declare %struct.uint32x2x3_t @llvm.arm.neon.vld1x3.v2i32.p0i32(i32*) nounwind readonly +declare %struct.uint32x2x4_t @llvm.arm.neon.vld1x4.v2i32.p0i32(i32*) nounwind readonly + +declare %struct.uint64x1x2_t @llvm.arm.neon.vld1x2.v1i64.p0i64(i64*) nounwind readonly +declare %struct.uint64x1x3_t @llvm.arm.neon.vld1x3.v1i64.p0i64(i64*) nounwind readonly +declare %struct.uint64x1x4_t @llvm.arm.neon.vld1x4.v1i64.p0i64(i64*) nounwind readonly + +declare %struct.uint8x8x2_t @llvm.arm.neon.vld1x2.v8i8.p0i8(i8*) nounwind readonly +declare %struct.uint8x8x3_t @llvm.arm.neon.vld1x3.v8i8.p0i8(i8*) nounwind readonly +declare %struct.uint8x8x4_t @llvm.arm.neon.vld1x4.v8i8.p0i8(i8*) nounwind readonly + +declare 
%struct.uint16x8x2_t @llvm.arm.neon.vld1x2.v8i16.p0i16(i16*) nounwind readonly +declare %struct.uint16x8x3_t @llvm.arm.neon.vld1x3.v8i16.p0i16(i16*) nounwind readonly +declare %struct.uint16x8x4_t @llvm.arm.neon.vld1x4.v8i16.p0i16(i16*) nounwind readonly + +declare %struct.uint32x4x2_t @llvm.arm.neon.vld1x2.v4i32.p0i32(i32*) nounwind readonly +declare %struct.uint32x4x3_t @llvm.arm.neon.vld1x3.v4i32.p0i32(i32*) nounwind readonly +declare %struct.uint32x4x4_t @llvm.arm.neon.vld1x4.v4i32.p0i32(i32*) nounwind readonly + +declare %struct.uint64x2x2_t @llvm.arm.neon.vld1x2.v2i64.p0i64(i64*) nounwind readonly +declare %struct.uint64x2x3_t @llvm.arm.neon.vld1x3.v2i64.p0i64(i64*) nounwind readonly +declare %struct.uint64x2x4_t @llvm.arm.neon.vld1x4.v2i64.p0i64(i64*) nounwind readonly + +declare %struct.uint8x16x2_t @llvm.arm.neon.vld1x2.v16i8.p0i8(i8*) nounwind readonly +declare %struct.uint8x16x3_t @llvm.arm.neon.vld1x3.v16i8.p0i8(i8*) nounwind readonly +declare %struct.uint8x16x4_t @llvm.arm.neon.vld1x4.v16i8.p0i8(i8*) nounwind readonly + +; CHECK-LABEL: test_vld1_u16_x2 +; CHECK: vld1.16 {d16, d17}, [r0:64] +define %struct.uint16x4x2_t @test_vld1_u16_x2(i16* %a) nounwind { + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0i16(i16* %a) + ret %struct.uint16x4x2_t %tmp +} + +; CHECK-LABEL: test_vld1_u16_x3 +; CHECK: vld1.16 {d16, d17, d18}, [r1:64] +define %struct.uint16x4x3_t @test_vld1_u16_x3(i16* %a) nounwind { + %tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld1x3.v4i16.p0i16(i16* %a) + ret %struct.uint16x4x3_t %tmp +} + +; CHECK-LABEL: test_vld1_u16_x4 +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256] +define %struct.uint16x4x4_t @test_vld1_u16_x4(i16* %a) nounwind { + %tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld1x4.v4i16.p0i16(i16* %a) + ret %struct.uint16x4x4_t %tmp +} + +; CHECK-LABEL: test_vld1_u32_x2 +; CHECK: vld1.32 {d16, d17}, [r0:64] +define %struct.uint32x2x2_t @test_vld1_u32_x2(i32* %a) nounwind { + %tmp = tail call 
%struct.uint32x2x2_t @llvm.arm.neon.vld1x2.v2i32.p0i32(i32* %a) + ret %struct.uint32x2x2_t %tmp +} + +; CHECK-LABEL: test_vld1_u32_x3 +; CHECK: vld1.32 {d16, d17, d18}, [r1:64] +define %struct.uint32x2x3_t @test_vld1_u32_x3(i32* %a) nounwind { + %tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld1x3.v2i32.p0i32(i32* %a) + ret %struct.uint32x2x3_t %tmp +} + +; CHECK-LABEL: test_vld1_u32_x4 +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256] +define %struct.uint32x2x4_t @test_vld1_u32_x4(i32* %a) nounwind { + %tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld1x4.v2i32.p0i32(i32* %a) + ret %struct.uint32x2x4_t %tmp +} + +; CHECK-LABEL: test_vld1_u64_x2 +; CHECK: vld1.64 {d16, d17}, [r0:64] +define %struct.uint64x1x2_t @test_vld1_u64_x2(i64* %a) nounwind { + %tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld1x2.v1i64.p0i64(i64* %a) + ret %struct.uint64x1x2_t %tmp +} + +; CHECK-LABEL: test_vld1_u64_x3 +; CHECK: vld1.64 {d16, d17, d18}, [r1:64] +define %struct.uint64x1x3_t @test_vld1_u64_x3(i64* %a) nounwind { + %tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld1x3.v1i64.p0i64(i64* %a) + ret %struct.uint64x1x3_t %tmp +} + +; CHECK-LABEL: test_vld1_u64_x4 +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256] +define %struct.uint64x1x4_t @test_vld1_u64_x4(i64* %a) nounwind { + %tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld1x4.v1i64.p0i64(i64* %a) + ret %struct.uint64x1x4_t %tmp +} + +; CHECK-LABEL: test_vld1_u8_x2 +; CHECK: vld1.8 {d16, d17}, [r0:64] +define %struct.uint8x8x2_t @test_vld1_u8_x2(i8* %a) nounwind { + %tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld1x2.v8i8.p0i8(i8* %a) + ret %struct.uint8x8x2_t %tmp +} + +; CHECK-LABEL: test_vld1_u8_x3 +; CHECK: vld1.8 {d16, d17, d18}, [r1:64] +define %struct.uint8x8x3_t @test_vld1_u8_x3(i8* %a) nounwind { + %tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld1x3.v8i8.p0i8(i8* %a) + ret %struct.uint8x8x3_t %tmp +} + +; CHECK-LABEL: test_vld1_u8_x4 +; CHECK: vld1.8 {d16, d17, d18, d19}, 
[r1:256] +define %struct.uint8x8x4_t @test_vld1_u8_x4(i8* %a) nounwind { + %tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld1x4.v8i8.p0i8(i8* %a) + ret %struct.uint8x8x4_t %tmp +} + +; CHECK-LABEL: test_vld1q_u16_x2 +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256] +define %struct.uint16x8x2_t @test_vld1q_u16_x2(i16* %a) nounwind { + %tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld1x2.v8i16.p0i16(i16* %a) + ret %struct.uint16x8x2_t %tmp +} + +; CHECK-LABEL: test_vld1q_u16_x3 +; CHECK: vld1.16 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.16 {d19, d20, d21}, [r1:64] +define %struct.uint16x8x3_t @test_vld1q_u16_x3(i16* %a) nounwind { + %tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld1x3.v8i16.p0i16(i16* %a) + ret %struct.uint16x8x3_t %tmp +} + +; CHECK-LABEL: test_vld1q_u16_x4 +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.16 {d20, d21, d22, d23}, [r1:256] +define %struct.uint16x8x4_t @test_vld1q_u16_x4(i16* %a) nounwind { + %tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld1x4.v8i16.p0i16(i16* %a) + ret %struct.uint16x8x4_t %tmp +} + +; CHECK-LABEL: test_vld1q_u32_x2 +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256] +define %struct.uint32x4x2_t @test_vld1q_u32_x2(i32* %a) nounwind { + %tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld1x2.v4i32.p0i32(i32* %a) + ret %struct.uint32x4x2_t %tmp +} + +; CHECK-LABEL: test_vld1q_u32_x3 +; CHECK: vld1.32 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.32 {d19, d20, d21}, [r1:64] +define %struct.uint32x4x3_t @test_vld1q_u32_x3(i32* %a) nounwind { + %tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld1x3.v4i32.p0i32(i32* %a) + ret %struct.uint32x4x3_t %tmp +} + +; CHECK-LABEL: test_vld1q_u32_x4 +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! 
+; CHECK: vld1.32 {d20, d21, d22, d23}, [r1:256] +define %struct.uint32x4x4_t @test_vld1q_u32_x4(i32* %a) nounwind { + %tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld1x4.v4i32.p0i32(i32* %a) + ret %struct.uint32x4x4_t %tmp +} + +; CHECK-LABEL: test_vld1q_u64_x2 +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256] +define %struct.uint64x2x2_t @test_vld1q_u64_x2(i64* %a) nounwind { + %tmp = tail call %struct.uint64x2x2_t @llvm.arm.neon.vld1x2.v2i64.p0i64(i64* %a) + ret %struct.uint64x2x2_t %tmp +} + +; CHECK-LABEL: test_vld1q_u64_x3 +; CHECK: vld1.64 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.64 {d19, d20, d21}, [r1:64] +define %struct.uint64x2x3_t @test_vld1q_u64_x3(i64* %a) nounwind { + %tmp = tail call %struct.uint64x2x3_t @llvm.arm.neon.vld1x3.v2i64.p0i64(i64* %a) + ret %struct.uint64x2x3_t %tmp +} + +; CHECK-LABEL: test_vld1q_u64_x4 +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.64 {d20, d21, d22, d23}, [r1:256] +define %struct.uint64x2x4_t @test_vld1q_u64_x4(i64* %a) nounwind { + %tmp = tail call %struct.uint64x2x4_t @llvm.arm.neon.vld1x4.v2i64.p0i64(i64* %a) + ret %struct.uint64x2x4_t %tmp +} + +; CHECK-LABEL: test_vld1q_u8_x2 +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256] +define %struct.uint8x16x2_t @test_vld1q_u8_x2(i8* %a) nounwind { + %tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld1x2.v16i8.p0i8(i8* %a) + ret %struct.uint8x16x2_t %tmp +} + +; CHECK-LABEL: test_vld1q_u8_x3 +; CHECK: vld1.8 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.8 {d19, d20, d21}, [r1:64] +define %struct.uint8x16x3_t @test_vld1q_u8_x3(i8* %a) nounwind { + %tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld1x3.v16i8.p0i8(i8* %a) + ret %struct.uint8x16x3_t %tmp +} + +; CHECK-LABEL: test_vld1q_u8_x4 +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! 
+; CHECK: vld1.8 {d20, d21, d22, d23}, [r1:256] +define %struct.uint8x16x4_t @test_vld1q_u8_x4(i8* %a) nounwind { + %tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld1x4.v16i8.p0i8(i8* %a) + ret %struct.uint8x16x4_t %tmp +}