Index: include/llvm/IR/IntrinsicsARM.td =================================================================== --- include/llvm/IR/IntrinsicsARM.td +++ include/llvm/IR/IntrinsicsARM.td @@ -659,6 +659,20 @@ LLVMMatchType<1>, llvm_i32_ty], [IntrArgMemOnly]>; +def int_arm_neon_vst1x2 : Intrinsic<[], + [llvm_anyptr_ty, llvm_anyvector_ty, + LLVMMatchType<1>], + [IntrArgMemOnly, NoCapture<0>]>; +def int_arm_neon_vst1x3 : Intrinsic<[], + [llvm_anyptr_ty, llvm_anyvector_ty, + LLVMMatchType<1>, LLVMMatchType<1>], + [IntrArgMemOnly, NoCapture<0>]>; +def int_arm_neon_vst1x4 : Intrinsic<[], + [llvm_anyptr_ty, llvm_anyvector_ty, + LLVMMatchType<1>, LLVMMatchType<1>, + LLVMMatchType<1>], + [IntrArgMemOnly, NoCapture<0>]>; + // Vector store N-element structure from one lane. // Source operands are: the address, the N vectors, the lane number, and // the alignment. Index: lib/Target/ARM/ARMExpandPseudoInsts.cpp =================================================================== --- lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -110,6 +110,9 @@ // OddDblSpc depending on the lane number operand. enum NEONRegSpacing { SingleSpc, + SingleLowSpc, // Single spacing, low registers, three and four vectors. + SingleHighQSpc, // Single spacing, high registers, four vectors. + SingleHighTSpc, // Single spacing, high registers, three vectors. 
EvenDblSpc, OddDblSpc }; @@ -259,12 +262,34 @@ { ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true}, { ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true}, +{ ARM::VST1d16QPseudo, ARM::VST1d16Q, false, false, false, SingleSpc, 4, 4 ,false}, +{ ARM::VST1d16TPseudo, ARM::VST1d16T, false, false, false, SingleSpc, 3, 4 ,false}, +{ ARM::VST1d32QPseudo, ARM::VST1d32Q, false, false, false, SingleSpc, 4, 2 ,false}, +{ ARM::VST1d32TPseudo, ARM::VST1d32T, false, false, false, SingleSpc, 3, 2 ,false}, { ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false}, { ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false}, { ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false}, { ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false}, { ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false}, { ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false}, +{ ARM::VST1d8QPseudo, ARM::VST1d8Q, false, false, false, SingleSpc, 4, 8 ,false}, +{ ARM::VST1d8TPseudo, ARM::VST1d8T, false, false, false, SingleSpc, 3, 8 ,false}, +{ ARM::VST1q16HighQPseudo, ARM::VST1d16Q, false, false, false, SingleHighQSpc, 4, 4 ,false}, +{ ARM::VST1q16HighTPseudo, ARM::VST1d16T, false, false, false, SingleHighTSpc, 3, 4 ,false}, +{ ARM::VST1q16LowQPseudo_UPD, ARM::VST1d16Qwb_fixed, false, true, true, SingleLowSpc, 4, 4 ,false}, +{ ARM::VST1q16LowTPseudo_UPD, ARM::VST1d16Twb_fixed, false, true, true, SingleLowSpc, 3, 4 ,false}, +{ ARM::VST1q32HighQPseudo, ARM::VST1d32Q, false, false, false, SingleHighQSpc, 4, 2 ,false}, +{ ARM::VST1q32HighTPseudo, ARM::VST1d32T, false, false, false, SingleHighTSpc, 3, 2 ,false}, +{ ARM::VST1q32LowQPseudo_UPD, ARM::VST1d32Qwb_fixed, false, true, true, SingleLowSpc, 4, 2 ,false}, +{ 
ARM::VST1q32LowTPseudo_UPD, ARM::VST1d32Twb_fixed, false, true, true, SingleLowSpc, 3, 2 ,false}, +{ ARM::VST1q64HighQPseudo, ARM::VST1d64Q, false, false, false, SingleHighQSpc, 4, 1 ,false}, +{ ARM::VST1q64HighTPseudo, ARM::VST1d64T, false, false, false, SingleHighTSpc, 3, 1 ,false}, +{ ARM::VST1q64LowQPseudo_UPD, ARM::VST1d64Qwb_fixed, false, true, true, SingleLowSpc, 4, 1 ,false}, +{ ARM::VST1q64LowTPseudo_UPD, ARM::VST1d64Twb_fixed, false, true, true, SingleLowSpc, 3, 1 ,false}, +{ ARM::VST1q8HighQPseudo, ARM::VST1d8Q, false, false, false, SingleHighQSpc, 4, 8 ,false}, +{ ARM::VST1q8HighTPseudo, ARM::VST1d8T, false, false, false, SingleHighTSpc, 3, 8 ,false}, +{ ARM::VST1q8LowQPseudo_UPD, ARM::VST1d8Qwb_fixed, false, true, true, SingleLowSpc, 4, 8 ,false}, +{ ARM::VST1q8LowTPseudo_UPD, ARM::VST1d8Twb_fixed, false, true, true, SingleLowSpc, 3, 8 ,false}, { ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, false, SingleSpc, 2, 4 ,true}, { ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true, SingleSpc, 2, 4 ,true}, @@ -370,11 +395,21 @@ static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc, const TargetRegisterInfo *TRI, unsigned &D0, unsigned &D1, unsigned &D2, unsigned &D3) { - if (RegSpc == SingleSpc) { + if (RegSpc == SingleSpc || RegSpc == SingleLowSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_0); D1 = TRI->getSubReg(Reg, ARM::dsub_1); D2 = TRI->getSubReg(Reg, ARM::dsub_2); D3 = TRI->getSubReg(Reg, ARM::dsub_3); + } else if (RegSpc == SingleHighQSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_4); + D1 = TRI->getSubReg(Reg, ARM::dsub_5); + D2 = TRI->getSubReg(Reg, ARM::dsub_6); + D3 = TRI->getSubReg(Reg, ARM::dsub_7); + } else if (RegSpc == SingleHighTSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_3); + D1 = TRI->getSubReg(Reg, ARM::dsub_4); + D2 = TRI->getSubReg(Reg, ARM::dsub_5); + D3 = TRI->getSubReg(Reg, ARM::dsub_6); } else if (RegSpc == EvenDblSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_0); D1 = TRI->getSubReg(Reg, ARM::dsub_2); @@ -474,9 +509,31 
@@ // Copy the addrmode6 operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); - // Copy the am6offset operand. - if (TableEntry->hasWritebackOperand) - MIB.add(MI.getOperand(OpIdx++)); + + if (TableEntry->hasWritebackOperand) { + // TODO: The writing-back pseudo instructions we translate here are all + // defined to take am6offset nodes that are able to represent both fixed + // and register forms. Some real instructions, however, do not rely on + // am6offset and have separate definitions for such forms. When this is the + // case, fixed forms do not take any offset nodes, so here we skip them for + // such instructions. Once all real and pseudo writing-back instructions are + // rewritten without use of am6offset nodes, this code will go away. + const MachineOperand &AM6Offset = MI.getOperand(OpIdx++); + if (TableEntry->RealOpc == ARM::VST1d8Qwb_fixed || + TableEntry->RealOpc == ARM::VST1d16Qwb_fixed || + TableEntry->RealOpc == ARM::VST1d32Qwb_fixed || + TableEntry->RealOpc == ARM::VST1d64Qwb_fixed || + TableEntry->RealOpc == ARM::VST1d8Twb_fixed || + TableEntry->RealOpc == ARM::VST1d16Twb_fixed || + TableEntry->RealOpc == ARM::VST1d32Twb_fixed || + TableEntry->RealOpc == ARM::VST1d64Twb_fixed) { + assert(AM6Offset.getReg() == 0 && + "A fixed writing-back pseudo intruction provides an offset " + "register!"); + } else { + MIB.add(AM6Offset); + } + } bool SrcIsKill = MI.getOperand(OpIdx).isKill(); bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); @@ -1563,6 +1620,9 @@ case ARM::VST3d8Pseudo: case ARM::VST3d16Pseudo: case ARM::VST3d32Pseudo: + case ARM::VST1d8TPseudo: + case ARM::VST1d16TPseudo: + case ARM::VST1d32TPseudo: case ARM::VST1d64TPseudo: case ARM::VST3d8Pseudo_UPD: case ARM::VST3d16Pseudo_UPD: @@ -1581,12 +1641,31 @@ case ARM::VST4d8Pseudo: case ARM::VST4d16Pseudo: case ARM::VST4d32Pseudo: + case ARM::VST1d8QPseudo: + case ARM::VST1d16QPseudo: + case ARM::VST1d32QPseudo: case ARM::VST1d64QPseudo: case ARM::VST4d8Pseudo_UPD: case 
ARM::VST4d16Pseudo_UPD: case ARM::VST4d32Pseudo_UPD: case ARM::VST1d64QPseudoWB_fixed: case ARM::VST1d64QPseudoWB_register: + case ARM::VST1q8HighQPseudo: + case ARM::VST1q8LowQPseudo_UPD: + case ARM::VST1q8HighTPseudo: + case ARM::VST1q8LowTPseudo_UPD: + case ARM::VST1q16HighQPseudo: + case ARM::VST1q16LowQPseudo_UPD: + case ARM::VST1q16HighTPseudo: + case ARM::VST1q16LowTPseudo_UPD: + case ARM::VST1q32HighQPseudo: + case ARM::VST1q32LowQPseudo_UPD: + case ARM::VST1q32HighTPseudo: + case ARM::VST1q32LowTPseudo_UPD: + case ARM::VST1q64HighQPseudo: + case ARM::VST1q64LowQPseudo_UPD: + case ARM::VST1q64HighTPseudo: + case ARM::VST1q64LowTPseudo_UPD: case ARM::VST4q8Pseudo_UPD: case ARM::VST4q16Pseudo_UPD: case ARM::VST4q32Pseudo_UPD: Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1905,9 +1905,7 @@ case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; case MVT::v2f64: - case MVT::v2i64: OpcodeIndex = 3; - assert(NumVecs == 1 && "v2i64 type only supported for VST1"); - break; + case MVT::v2i64: OpcodeIndex = 3; break; } std::vector ResTys; @@ -3519,6 +3517,51 @@ return; } + case Intrinsic::arm_neon_vst1x2: { + static const uint16_t DOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; + static const uint16_t QOpcodes[] = { ARM::VST1d8QPseudo, + ARM::VST1d16QPseudo, + ARM::VST1d32QPseudo, + ARM::VST1d64QPseudo }; + SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr); + return; + } + + case Intrinsic::arm_neon_vst1x3: { + static const uint16_t DOpcodes[] = { ARM::VST1d8TPseudo, + ARM::VST1d16TPseudo, + ARM::VST1d32TPseudo, + ARM::VST1d64TPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VST1q8LowTPseudo_UPD, + ARM::VST1q16LowTPseudo_UPD, + ARM::VST1q32LowTPseudo_UPD, + ARM::VST1q64LowTPseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST1q8HighTPseudo, + ARM::VST1q16HighTPseudo, + 
ARM::VST1q32HighTPseudo, + ARM::VST1q64HighTPseudo }; + SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + return; + } + + case Intrinsic::arm_neon_vst1x4: { + static const uint16_t DOpcodes[] = { ARM::VST1d8QPseudo, + ARM::VST1d16QPseudo, + ARM::VST1d32QPseudo, + ARM::VST1d64QPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VST1q8LowQPseudo_UPD, + ARM::VST1q16LowQPseudo_UPD, + ARM::VST1q32LowQPseudo_UPD, + ARM::VST1q64LowQPseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST1q8HighQPseudo, + ARM::VST1q16HighQPseudo, + ARM::VST1q32HighQPseudo, + ARM::VST1q64HighQPseudo }; + SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; + } + case Intrinsic::arm_neon_vst2: { static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, ARM::VST2d32, ARM::VST1q64 }; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -12802,6 +12802,9 @@ case Intrinsic::arm_neon_vld3lane: case Intrinsic::arm_neon_vld4lane: case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst1x2: + case Intrinsic::arm_neon_vst1x3: + case Intrinsic::arm_neon_vst1x4: case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: case Intrinsic::arm_neon_vst4: @@ -14128,6 +14131,27 @@ Info.flags = MachineMemOperand::MOStore; return true; } + case Intrinsic::arm_neon_vst1x2: + case Intrinsic::arm_neon_vst1x3: + case Intrinsic::arm_neon_vst1x4: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. 
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 0; + // volatile stores with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOStore; + return true; + } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); Index: lib/Target/ARM/ARMInstrNEON.td =================================================================== --- lib/Target/ARM/ARMInstrNEON.td +++ lib/Target/ARM/ARMInstrNEON.td @@ -1777,10 +1777,22 @@ defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>; +def VST1d8TPseudo : VSTQQPseudo, Sched<[WriteVST3]>; +def VST1d16TPseudo : VSTQQPseudo, Sched<[WriteVST3]>; +def VST1d32TPseudo : VSTQQPseudo, Sched<[WriteVST3]>; def VST1d64TPseudo : VSTQQPseudo, Sched<[WriteVST3]>; def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST3]>; def VST1d64TPseudoWB_register : VSTQQWBPseudo, Sched<[WriteVST3]>; +def VST1q8HighTPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; +def VST1q8LowTPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST1q16HighTPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; +def VST1q16LowTPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST1q32HighTPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; +def VST1q32LowTPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST1q64HighTPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; +def VST1q64LowTPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; + // ...with 4 registers class VST1D4 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), @@ 
-1820,10 +1832,22 @@ defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; +def VST1d8QPseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST1d16QPseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST1d32QPseudo : VSTQQPseudo, Sched<[WriteVST4]>; def VST1d64QPseudo : VSTQQPseudo, Sched<[WriteVST4]>; def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; def VST1d64QPseudoWB_register : VSTQQWBPseudo, Sched<[WriteVST4]>; +def VST1q8HighQPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; +def VST1q8LowQPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST1q16HighQPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; +def VST1q16LowQPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST1q32HighQPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; +def VST1q32LowQPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST1q64HighQPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; +def VST1q64LowQPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; + // VST2 : Vector Store (multiple 2-element structures) class VST2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> Index: test/CodeGen/ARM/arm-vst1.ll =================================================================== --- test/CodeGen/ARM/arm-vst1.ll +++ test/CodeGen/ARM/arm-vst1.ll @@ -0,0 +1,363 @@ +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \ +; RUN: -asm-verbose=false | FileCheck %s + +; %struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> } +; %struct.uint16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } +; %struct.uint16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } + +; %struct.uint32x2x2_t = type { <2 x i32>, <2 x i32> } +; %struct.uint32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> } +; %struct.uint32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } + +; %struct.uint64x1x2_t = type { <1 x i64>, <1 x i64> } +; %struct.uint64x1x3_t = type { <1 x i64>, 
<1 x i64>, <1 x i64> } +; %struct.uint64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } + +; %struct.uint8x8x2_t = type { <8 x i8>, <8 x i8> } +; %struct.uint8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } +; %struct.uint8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } + +; %struct.uint16x8x2_t = type { <8 x i16>, <8 x i16> } +; %struct.uint16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> } +; %struct.uint16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } + +; %struct.uint32x4x2_t = type { <4 x i32>, <4 x i32> } +; %struct.uint32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> } +; %struct.uint32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } + +; %struct.uint64x2x2_t = type { <2 x i64>, <2 x i64> } +; %struct.uint64x2x3_t = type { <2 x i64>, <2 x i64>, <2 x i64> } +; %struct.uint64x2x4_t = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } + +; %struct.uint8x16x2_t = type { <16 x i8>, <16 x i8> } +; %struct.uint8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> } +; %struct.uint8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } + +%struct.uint16x4x2_t = type { [2 x <4 x i16>] } +%struct.uint16x4x3_t = type { [3 x <4 x i16>] } +%struct.uint16x4x4_t = type { [4 x <4 x i16>] } +%struct.uint32x2x2_t = type { [2 x <2 x i32>] } +%struct.uint32x2x3_t = type { [3 x <2 x i32>] } +%struct.uint32x2x4_t = type { [4 x <2 x i32>] } +%struct.uint64x1x2_t = type { [2 x <1 x i64>] } +%struct.uint64x1x3_t = type { [3 x <1 x i64>] } +%struct.uint64x1x4_t = type { [4 x <1 x i64>] } +%struct.uint8x8x2_t = type { [2 x <8 x i8>] } +%struct.uint8x8x3_t = type { [3 x <8 x i8>] } +%struct.uint8x8x4_t = type { [4 x <8 x i8>] } +%struct.uint16x8x2_t = type { [2 x <8 x i16>] } +%struct.uint16x8x3_t = type { [3 x <8 x i16>] } +%struct.uint16x8x4_t = type { [4 x <8 x i16>] } +%struct.uint32x4x2_t = type { [2 x <4 x i32>] } +%struct.uint32x4x3_t = type { [3 x <4 x i32>] } +%struct.uint32x4x4_t = type { [4 x <4 x i32>] } 
+%struct.uint64x2x2_t = type { [2 x <2 x i64>] } +%struct.uint64x2x3_t = type { [3 x <2 x i64>] } +%struct.uint64x2x4_t = type { [4 x <2 x i64>] } +%struct.uint8x16x2_t = type { [2 x <16 x i8>] } +%struct.uint8x16x3_t = type { [3 x <16 x i8>] } +%struct.uint8x16x4_t = type { [4 x <16 x i8>] } + +declare void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* nocapture, <4 x i16>, <4 x i16>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* nocapture, <4 x i16>, <4 x i16>, <4 x i16>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* nocapture, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* nocapture, <2 x i32>, <2 x i32>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* nocapture, <2 x i32>, <2 x i32>, <2 x i32>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* nocapture, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* nocapture, <1 x i64>, <1 x i64>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* nocapture, <1 x i64>, <1 x i64>, <1 x i64>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* nocapture, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* nocapture, <8 x i8>, <8 x i8>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* nocapture, <8 x i8>, <8 x i8>, <8 x i8>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* nocapture, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* nocapture, <8 x i16>, <8 x i16>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* nocapture, <8 x i16>, <8 x i16>, <8 x i16>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* nocapture, <8 x i16>, <8 x i16>, 
<8 x i16>, <8 x i16>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* nocapture, <4 x i32>, <4 x i32>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* nocapture, <4 x i32>, <4 x i32>, <4 x i32>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* nocapture, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* nocapture, <2 x i64>, <2 x i64>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* nocapture, <2 x i64>, <2 x i64>, <2 x i64>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* nocapture, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>) argmemonly nounwind + +declare void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8>, <16 x i8>) argmemonly nounwind +declare void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) argmemonly nounwind + +; CHECK-LABEL: test_vst1_u16_x2 +; CHECK: vst1.16 {d16, d17}, [r0:64] +define void @test_vst1_u16_x2(i16* %a, %struct.uint16x4x2_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* %a, <4 x i16> %b0, <4 x i16> %b1) + ret void +} + +; CHECK-LABEL: test_vst1_u16_x3 +; CHECK: vst1.16 {d16, d17, d18}, [r0:64] +define void @test_vst1_u16_x3(i16* %a, %struct.uint16x4x3_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x3_t %b, 0, 1 + %b2 = extractvalue %struct.uint16x4x3_t %b, 0, 2 + tail call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* %a, <4 x i16> %b0, <4 x i16> %b1, <4 x i16> %b2) + ret void +} + +; CHECK-LABEL: test_vst1_u16_x4 +; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256] +define void 
@test_vst1_u16_x4(i16* %a, %struct.uint16x4x4_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x4_t %b, 0, 1 + %b2 = extractvalue %struct.uint16x4x4_t %b, 0, 2 + %b3 = extractvalue %struct.uint16x4x4_t %b, 0, 3 + tail call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* %a, <4 x i16> %b0, <4 x i16> %b1, <4 x i16> %b2, <4 x i16> %b3) + ret void +} + +; CHECK-LABEL: test_vst1_u32_x2 +; CHECK: vst1.32 {d16, d17}, [r0:64] +define void @test_vst1_u32_x2(i32* %a, %struct.uint32x2x2_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint32x2x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* %a, <2 x i32> %b0, <2 x i32> %b1) + ret void +} + +; CHECK-LABEL: test_vst1_u32_x3 +; CHECK: vst1.32 {d16, d17, d18}, [r0:64] +define void @test_vst1_u32_x3(i32* %a, %struct.uint32x2x3_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 + %b1 = extractvalue %struct.uint32x2x3_t %b, 0, 1 + %b2 = extractvalue %struct.uint32x2x3_t %b, 0, 2 + tail call void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* %a, <2 x i32> %b0, <2 x i32> %b1, <2 x i32> %b2) + ret void +} + +; CHECK-LABEL: test_vst1_u32_x4 +; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1_u32_x4(i32* %a, %struct.uint32x2x4_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 + %b1 = extractvalue %struct.uint32x2x4_t %b, 0, 1 + %b2 = extractvalue %struct.uint32x2x4_t %b, 0, 2 + %b3 = extractvalue %struct.uint32x2x4_t %b, 0, 3 + tail call void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* %a, <2 x i32> %b0, <2 x i32> %b1, <2 x i32> %b2, <2 x i32> %b3) + ret void +} + +; CHECK-LABEL: test_vst1_u64_x2 +; CHECK: vst1.64 {d16, d17}, [r0:64] +define void @test_vst1_u64_x2(i64* %a, %struct.uint64x1x2_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint64x1x2_t %b, 0, 1 + tail call void 
@llvm.arm.neon.vst1x2.p0i64.v1i64(i64* %a, <1 x i64> %b0, <1 x i64> %b1) + ret void +} + +; CHECK-LABEL: test_vst1_u64_x3 +; CHECK: vst1.64 {d16, d17, d18}, [r0:64] +define void @test_vst1_u64_x3(i64* %a, %struct.uint64x1x3_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 + %b1 = extractvalue %struct.uint64x1x3_t %b, 0, 1 + %b2 = extractvalue %struct.uint64x1x3_t %b, 0, 2 + tail call void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* %a, <1 x i64> %b0, <1 x i64> %b1, <1 x i64> %b2) + ret void +} + +; CHECK-LABEL: test_vst1_u64_x4 +; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1_u64_x4(i64* %a, %struct.uint64x1x4_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 + %b1 = extractvalue %struct.uint64x1x4_t %b, 0, 1 + %b2 = extractvalue %struct.uint64x1x4_t %b, 0, 2 + %b3 = extractvalue %struct.uint64x1x4_t %b, 0, 3 + tail call void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* %a, <1 x i64> %b0, <1 x i64> %b1, <1 x i64> %b2, <1 x i64> %b3) + ret void +} + +; CHECK-LABEL: test_vst1_u8_x2 +; CHECK: vst1.8 {d16, d17}, [r0:64] +define void @test_vst1_u8_x2(i8* %a, %struct.uint8x8x2_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint8x8x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1) + ret void +} + +; CHECK-LABEL: test_vst1_u8_x3 +; CHECK: vst1.8 {d16, d17, d18}, [r0:64] +define void @test_vst1_u8_x3(i8* %a, %struct.uint8x8x3_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 + %b1 = extractvalue %struct.uint8x8x3_t %b, 0, 1 + %b2 = extractvalue %struct.uint8x8x3_t %b, 0, 2 + tail call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1, <8 x i8> %b2) + ret void +} + +; CHECK-LABEL: test_vst1_u8_x4 +; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1_u8_x4(i8* %a, %struct.uint8x8x4_t %b) nounwind { +entry: + %b0 = extractvalue 
%struct.uint8x8x4_t %b, 0, 0 + %b1 = extractvalue %struct.uint8x8x4_t %b, 0, 1 + %b2 = extractvalue %struct.uint8x8x4_t %b, 0, 2 + %b3 = extractvalue %struct.uint8x8x4_t %b, 0, 3 + tail call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1, <8 x i8> %b2, <8 x i8> %b3) + ret void +} + +; CHECK-LABEL: test_vst1q_u16_x2 +; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1q_u16_x2(i16* %a, %struct.uint16x8x2_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x8x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* %a, <8 x i16> %b0, <8 x i16> %b1) + ret void +} + +; CHECK-LABEL: test_vst1q_u16_x3 +; CHECK: vst1.16 {d16, d17, d18}, [r0:64]! +; CHECK: vst1.16 {d19, d20, d21}, [r0:64] +define void @test_vst1q_u16_x3(i16* %a, %struct.uint16x8x3_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint16x8x3_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x8x3_t %b, 0, 1 + %b2 = extractvalue %struct.uint16x8x3_t %b, 0, 2 + tail call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* %a, <8 x i16> %b0, <8 x i16> %b1, <8 x i16> %b2) + ret void +} + +; CHECK-LABEL: test_vst1q_u16_x4 +; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256]! 
+; CHECK: vst1.16 {d20, d21, d22, d23}, [r0:256] +define void @test_vst1q_u16_x4(i16* %a, %struct.uint16x8x4_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint16x8x4_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x8x4_t %b, 0, 1 + %b2 = extractvalue %struct.uint16x8x4_t %b, 0, 2 + %b3 = extractvalue %struct.uint16x8x4_t %b, 0, 3 + tail call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* %a, <8 x i16> %b0, <8 x i16> %b1, <8 x i16> %b2, <8 x i16> %b3) + ret void +} + +; CHECK-LABEL: test_vst1q_u32_x2 +; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1q_u32_x2(i32* %a, %struct.uint32x4x2_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint32x4x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* %a, <4 x i32> %b0, <4 x i32> %b1) + ret void +} + +; CHECK-LABEL: test_vst1q_u32_x3 +; CHECK: vst1.32 {d16, d17, d18}, [r0:64]! +; CHECK: vst1.32 {d19, d20, d21}, [r0:64] +define void @test_vst1q_u32_x3(i32* %a, %struct.uint32x4x3_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint32x4x3_t %b, 0, 0 + %b1 = extractvalue %struct.uint32x4x3_t %b, 0, 1 + %b2 = extractvalue %struct.uint32x4x3_t %b, 0, 2 + tail call void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* %a, <4 x i32> %b0, <4 x i32> %b1, <4 x i32> %b2) + ret void +} + +; CHECK-LABEL: test_vst1q_u32_x4 +; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256]! 
+; CHECK: vst1.32 {d20, d21, d22, d23}, [r0:256] +define void @test_vst1q_u32_x4(i32* %a, %struct.uint32x4x4_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint32x4x4_t %b, 0, 0 + %b1 = extractvalue %struct.uint32x4x4_t %b, 0, 1 + %b2 = extractvalue %struct.uint32x4x4_t %b, 0, 2 + %b3 = extractvalue %struct.uint32x4x4_t %b, 0, 3 + tail call void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* %a, <4 x i32> %b0, <4 x i32> %b1, <4 x i32> %b2, <4 x i32> %b3) + ret void +} + +; CHECK-LABEL: test_vst1q_u64_x2 +; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1q_u64_x2(i64* %a, %struct.uint64x2x2_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint64x2x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* %a, <2 x i64> %b0, <2 x i64> %b1) + ret void +} + +; CHECK-LABEL: test_vst1q_u64_x3 +; CHECK: vst1.64 {d16, d17, d18}, [r0:64]! +; CHECK: vst1.64 {d19, d20, d21}, [r0:64] +define void @test_vst1q_u64_x3(i64* %a, %struct.uint64x2x3_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint64x2x3_t %b, 0, 0 + %b1 = extractvalue %struct.uint64x2x3_t %b, 0, 1 + %b2 = extractvalue %struct.uint64x2x3_t %b, 0, 2 + tail call void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* %a, <2 x i64> %b0, <2 x i64> %b1, <2 x i64> %b2) + ret void +} + +; CHECK-LABEL: test_vst1q_u64_x4 +; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256]! 
+; CHECK: vst1.64 {d20, d21, d22, d23}, [r0:256] +define void @test_vst1q_u64_x4(i64* %a, %struct.uint64x2x4_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint64x2x4_t %b, 0, 0 + %b1 = extractvalue %struct.uint64x2x4_t %b, 0, 1 + %b2 = extractvalue %struct.uint64x2x4_t %b, 0, 2 + %b3 = extractvalue %struct.uint64x2x4_t %b, 0, 3 + tail call void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* %a, <2 x i64> %b0, <2 x i64> %b1, <2 x i64> %b2, <2 x i64> %b3) + ret void +} + +; CHECK-LABEL: test_vst1q_u8_x2 +; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256] +define void @test_vst1q_u8_x2(i8* %a, %struct.uint8x16x2_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint8x16x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> %b0, <16 x i8> %b1) + ret void +} + +; CHECK-LABEL: test_vst1q_u8_x3 +; CHECK: vst1.8 {d16, d17, d18}, [r0:64]! +; CHECK: vst1.8 {d19, d20, d21}, [r0:64] +define void @test_vst1q_u8_x3(i8* %a, %struct.uint8x16x3_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint8x16x3_t %b, 0, 0 + %b1 = extractvalue %struct.uint8x16x3_t %b, 0, 1 + %b2 = extractvalue %struct.uint8x16x3_t %b, 0, 2 + tail call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> %b0, <16 x i8> %b1, <16 x i8> %b2) + ret void +} + +; CHECK-LABEL: test_vst1q_u8_x4 +; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256]! +; CHECK: vst1.8 {d20, d21, d22, d23}, [r0:256] +define void @test_vst1q_u8_x4(i8* %a, %struct.uint8x16x4_t %b) nounwind { +entry: + %b0 = extractvalue %struct.uint8x16x4_t %b, 0, 0 + %b1 = extractvalue %struct.uint8x16x4_t %b, 0, 1 + %b2 = extractvalue %struct.uint8x16x4_t %b, 0, 2 + %b3 = extractvalue %struct.uint8x16x4_t %b, 0, 3 + tail call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> %b0, <16 x i8> %b1, <16 x i8> %b2, <16 x i8> %b3) + ret void +}