Index: lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
===================================================================
--- lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
+++ lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
@@ -988,9 +988,12 @@
 
   // Update uses of vector list
   SDValue SuperReg = SDValue(Ld, 1);
-  for (unsigned i = 0; i < NumVecs; ++i)
-    ReplaceUses(SDValue(N, i),
-        CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+  if (NumVecs == 1)
+    ReplaceUses(SDValue(N, 0), SuperReg);
+  else
+    for (unsigned i = 0; i < NumVecs; ++i)
+      ReplaceUses(SDValue(N, i),
+          CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
 
   // Update the chain
   ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
@@ -1153,14 +1156,20 @@
 
   // Update uses of the vector list
   SDValue SuperReg = SDValue(Ld, 1);
-  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
-  static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2,
-                              ARM64::qsub3 };
-  for (unsigned i = 0; i < NumVecs; ++i) {
-    SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
-    if (Narrow)
-      NV = NarrowVector(NV, *CurDAG);
-    ReplaceUses(SDValue(N, i), NV);
+  if (NumVecs == 1)
+    ReplaceUses(SDValue(N, 0),
+                Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
+  else {
+    EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+    static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2,
+                                ARM64::qsub3 };
+    for (unsigned i = 0; i < NumVecs; ++i) {
+      SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
+                                                  SuperReg);
+      if (Narrow)
+        NV = NarrowVector(NV, *CurDAG);
+      ReplaceUses(SDValue(N, i), NV);
+    }
   }
 
   // Update the Chain
@@ -2657,6 +2666,25 @@
       return SelectPostLoad(Node, 4, ARM64::LD1Fourv2d_POST, ARM64::qsub0);
     break;
   }
+  case ARM64ISD::LD1DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 1, ARM64::LD1Rv8b_POST, ARM64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 1, ARM64::LD1Rv16b_POST, ARM64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 1, ARM64::LD1Rv4h_POST, ARM64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 1, ARM64::LD1Rv8h_POST, ARM64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 1, ARM64::LD1Rv2s_POST, ARM64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 1, ARM64::LD1Rv4s_POST, ARM64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 1, ARM64::LD1Rv1d_POST, ARM64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 1, ARM64::LD1Rv2d_POST, ARM64::qsub0);
+    break;
+  }
   case ARM64ISD::LD2DUPpost: {
     if (VT == MVT::v8i8)
       return SelectPostLoad(Node, 2, ARM64::LD2Rv8b_POST, ARM64::dsub0);
@@ -2714,6 +2742,19 @@
      return SelectPostLoad(Node, 4, ARM64::LD4Rv2d_POST, ARM64::qsub0);
     break;
   }
+  case ARM64ISD::LD1LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 1, ARM64::LD1i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 1, ARM64::LD1i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 1, ARM64::LD1i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 1, ARM64::LD1i64_POST);
+    break;
+  }
   case ARM64ISD::LD2LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
      return SelectPostLoadLane(Node, 2, ARM64::LD2i8_POST);
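For reference, the source-level pattern these LD1Rv*_POST selections target is a load-and-splat whose address is then advanced by the element size. A minimal C++ sketch, assuming arm_neon.h on an AArch64 target (the function and its names are illustrative, not part of this patch):

#include <arm_neon.h>

// Each iteration loads one byte, splats it across all 16 lanes, and
// advances the pointer by one byte. With this patch the load, the dup,
// and the increment can be selected as a single post-indexed instruction:
//   ld1r.16b { v0 }, [x0], #1
void splat_bytes(const uint8_t *src, uint8x16_t *dst, int n) {
  for (int i = 0; i < n; ++i) {
    dst[i] = vld1q_dup_u8(src); // load + splat -> ARM64ISD::LD1DUPpost
    ++src;                      // constant increment == element size
  }
}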
Index: lib/Target/ARM64/ARM64ISelLowering.h
===================================================================
--- lib/Target/ARM64/ARM64ISelLowering.h
+++ lib/Target/ARM64/ARM64ISelLowering.h
@@ -175,9 +175,11 @@
     ST1x2post,
     ST1x3post,
     ST1x4post,
+    LD1DUPpost,
     LD2DUPpost,
     LD3DUPpost,
     LD4DUPpost,
+    LD1LANEpost,
     LD2LANEpost,
     LD3LANEpost,
     LD4LANEpost,
Index: lib/Target/ARM64/ARM64ISelLowering.cpp
===================================================================
--- lib/Target/ARM64/ARM64ISelLowering.cpp
+++ lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -371,6 +371,7 @@
 
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -744,9 +745,11 @@
   case ARM64ISD::ST1x2post: return "ARM64ISD::ST1x2post";
   case ARM64ISD::ST1x3post: return "ARM64ISD::ST1x3post";
   case ARM64ISD::ST1x4post: return "ARM64ISD::ST1x4post";
+  case ARM64ISD::LD1DUPpost: return "ARM64ISD::LD1DUPpost";
   case ARM64ISD::LD2DUPpost: return "ARM64ISD::LD2DUPpost";
   case ARM64ISD::LD3DUPpost: return "ARM64ISD::LD3DUPpost";
   case ARM64ISD::LD4DUPpost: return "ARM64ISD::LD4DUPpost";
+  case ARM64ISD::LD1LANEpost: return "ARM64ISD::LD1LANEpost";
   case ARM64ISD::LD2LANEpost: return "ARM64ISD::LD2LANEpost";
   case ARM64ISD::LD3LANEpost: return "ARM64ISD::LD3LANEpost";
   case ARM64ISD::LD4LANEpost: return "ARM64ISD::LD4LANEpost";
@@ -7078,6 +7081,87 @@
                       S->getAlignment());
 }
 
+/// Target-specific DAG combine function for post-increment LD1 (lane) and
+/// post-increment LD1R.
+static SDValue performPostLD1Combine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     bool IsLaneOp) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  unsigned LoadIdx = IsLaneOp ? 1 : 0;
+  SDNode *LD = N->getOperand(LoadIdx).getNode();
+  // If it is not a LOAD/EXTLOAD, we cannot do this combine.
+  if (LD->getOpcode() != ISD::LOAD && LD->getOpcode() != ISD::EXTLOAD)
+    return SDValue();
+
+  // Check if there are other uses. If so, do not combine as it will introduce
+  // an extra load.
+  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
+       ++UI) {
+    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+      continue;
+    if (*UI != N)
+      return SDValue();
+  }
+
+  SDValue Addr = LD->getOperand(1);
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD
+        || UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load. Otherwise, folding it
+    // would create a cycle.
+    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
+      continue;
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = (VT.getSizeInBits() / 8) / VT.getVectorNumElements();
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getRegister(ARM64::XZR, MVT::i64);
+    }
+
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(LD->getOperand(0));  // Chain
+    if (IsLaneOp) {
+      Ops.push_back(N->getOperand(0)); // The vector to be inserted
+      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
+    }
+    Ops.push_back(Addr);
+    Ops.push_back(Inc);
+
+    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    LoadSDNode *LoadSD = cast<LoadSDNode>(LD);
+    unsigned NewOp = IsLaneOp ? ARM64ISD::LD1LANEpost : ARM64ISD::LD1DUPpost;
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
+                                           LoadSD->getMemoryVT(),
+                                           LoadSD->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    NewResults.push_back(SDValue(LD, 0));             // The result of load
+    NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+    DCI.CombineTo(LD, NewResults);
+    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted result
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write-back register
+
+    break;
+  }
+  return SDValue();
+}
+
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
 static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -7178,7 +7262,7 @@
     if (IsLaneOp || IsStore)
       for (unsigned i = 2; i < AddrOpIdx; ++i)
         Ops.push_back(N->getOperand(i));
-    Ops.push_back(N->getOperand(AddrOpIdx)); // Base register
+    Ops.push_back(Addr); // Base register
    Ops.push_back(Inc);
 
     // Return Types.
@@ -7368,6 +7452,10 @@
     return performSTORECombine(N, DCI, DAG, Subtarget);
   case ARM64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
+  case ARM64ISD::DUP:
+    return performPostLD1Combine(N, DCI, false);
+  case ISD::INSERT_VECTOR_ELT:
+    return performPostLD1Combine(N, DCI, true);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
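The IsLaneOp path of performPostLD1Combine matches the analogous lane-insert pattern: a scalar load feeding an INSERT_VECTOR_ELT, with an independent add of the element size on the same base address. A hedged C++ sketch, again assuming arm_neon.h (names are illustrative, not part of this patch):

#include <arm_neon.h>

// Loads one float into lane 1 of an existing vector and bumps the pointer
// by one element (4 bytes); a candidate for
//   ld1.s { v0 }[1], [x0], #4
float32x4_t load_into_lane1(const float *&src, float32x4_t v) {
  v = vld1q_lane_f32(src, v, 1); // load + insertelement -> LD1LANEpost
  src += 1;                      // constant increment == element size
  return v;
}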
Index: test/CodeGen/ARM64/indexed-vector-ldst.ll
===================================================================
--- test/CodeGen/ARM64/indexed-vector-ldst.ll
+++ test/CodeGen/ARM64/indexed-vector-ldst.ll
@@ -5687,4 +5687,488 @@
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
\ No newline at end of file
+declare void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
+
+define <16 x i8> @test_v16i8_post_imm_ld1r(i8* %bar, i8** %ptr) {
+; CHECK-LABEL: test_v16i8_post_imm_ld1r:
+; CHECK: ld1r.16b { v0 }, [x0], #1
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <16 x i8> undef, i8 %tmp1, i32 0
+  %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+  %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+  %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+  %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+  %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+  %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+  %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+  %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+  %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+  %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+  %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+  %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+  %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+  %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+  %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+  %tmp18 = getelementptr i8* %bar, i64 1
+  store i8* %tmp18, i8** %ptr
+  ret <16 x i8> %tmp17
+}
+
+define <16 x i8> @test_v16i8_post_reg_ld1r(i8* %bar, i8** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v16i8_post_reg_ld1r:
+; CHECK: ld1r.16b { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <16 x i8> undef, i8 %tmp1, i32 0
+  %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+  %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+  %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+  %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+  %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+  %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+  %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+  %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+  %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+  %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+  %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+  %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+  %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+  %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+  %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+  %tmp18 = getelementptr i8* %bar, i64 %inc
+  store i8* %tmp18, i8** %ptr
+  ret <16 x i8> %tmp17
+}
+
+define <8 x i8> @test_v8i8_post_imm_ld1r(i8* %bar, i8** %ptr) {
+; CHECK-LABEL: test_v8i8_post_imm_ld1r:
+; CHECK: ld1r.8b { v0 }, [x0], #1
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <8 x i8> undef, i8 %tmp1, i32 0
+  %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+  %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+  %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+  %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+  %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+  %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+  %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+  %tmp10 = getelementptr i8* %bar, i64 1
+  store i8* %tmp10, i8** %ptr
+  ret <8 x i8> %tmp9
+}
+
+define <8 x i8> @test_v8i8_post_reg_ld1r(i8* %bar, i8** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v8i8_post_reg_ld1r:
+; CHECK: ld1r.8b { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <8 x i8> undef, i8 %tmp1, i32 0
+  %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+  %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+  %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+  %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+  %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+  %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+  %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+  %tmp10 = getelementptr i8* %bar, i64 %inc
+  store i8* %tmp10, i8** %ptr
+  ret <8 x i8> %tmp9
+}
+
+define <8 x i16> @test_v8i16_post_imm_ld1r(i16* %bar, i16** %ptr) {
+; CHECK-LABEL: test_v8i16_post_imm_ld1r:
+; CHECK: ld1r.8h { v0 }, [x0], #2
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
+  %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+  %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+  %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+  %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+  %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+  %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+  %tmp10 = getelementptr i16* %bar, i64 1
+  store i16* %tmp10, i16** %ptr
+  ret <8 x i16> %tmp9
+}
+
+define <8 x i16> @test_v8i16_post_reg_ld1r(i16* %bar, i16** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v8i16_post_reg_ld1r:
+; CHECK: ld1r.8h { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
+  %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+  %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+  %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+  %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+  %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+  %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+  %tmp10 = getelementptr i16* %bar, i64 %inc
+  store i16* %tmp10, i16** %ptr
+  ret <8 x i16> %tmp9
+}
+
+define <4 x i16> @test_v4i16_post_imm_ld1r(i16* %bar, i16** %ptr) {
+; CHECK-LABEL: test_v4i16_post_imm_ld1r:
+; CHECK: ld1r.4h { v0 }, [x0], #2
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
+  %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+  %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+  %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+  %tmp6 = getelementptr i16* %bar, i64 1
+  store i16* %tmp6, i16** %ptr
+  ret <4 x i16> %tmp5
+}
+
+define <4 x i16> @test_v4i16_post_reg_ld1r(i16* %bar, i16** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4i16_post_reg_ld1r:
+; CHECK: ld1r.4h { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
+  %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+  %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+  %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+  %tmp6 = getelementptr i16* %bar, i64 %inc
+  store i16* %tmp6, i16** %ptr
+  ret <4 x i16> %tmp5
+}
+
+define <4 x i32> @test_v4i32_post_imm_ld1r(i32* %bar, i32** %ptr) {
+; CHECK-LABEL: test_v4i32_post_imm_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], #4
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0
+  %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+  %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+  %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+  %tmp6 = getelementptr i32* %bar, i64 1
+  store i32* %tmp6, i32** %ptr
+  ret <4 x i32> %tmp5
+}
+
+define <4 x i32> @test_v4i32_post_reg_ld1r(i32* %bar, i32** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4i32_post_reg_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0
+  %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+  %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+  %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+  %tmp6 = getelementptr i32* %bar, i64 %inc
+  store i32* %tmp6, i32** %ptr
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i32> @test_v2i32_post_imm_ld1r(i32* %bar, i32** %ptr) {
+; CHECK-LABEL: test_v2i32_post_imm_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], #4
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
+  %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+  %tmp4 = getelementptr i32* %bar, i64 1
+  store i32* %tmp4, i32** %ptr
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test_v2i32_post_reg_ld1r(i32* %bar, i32** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2i32_post_reg_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
+  %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+  %tmp4 = getelementptr i32* %bar, i64 %inc
+  store i32* %tmp4, i32** %ptr
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i64> @test_v2i64_post_imm_ld1r(i64* %bar, i64** %ptr) {
+; CHECK-LABEL: test_v2i64_post_imm_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], #8
+  %tmp1 = load i64* %bar
+  %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
+  %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+  %tmp4 = getelementptr i64* %bar, i64 1
+  store i64* %tmp4, i64** %ptr
+  ret <2 x i64> %tmp3
+}
+
+define <2 x i64> @test_v2i64_post_reg_ld1r(i64* %bar, i64** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2i64_post_reg_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load i64* %bar
+  %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
+  %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+  %tmp4 = getelementptr i64* %bar, i64 %inc
+  store i64* %tmp4, i64** %ptr
+  ret <2 x i64> %tmp3
+}
+
+define <4 x float> @test_v4f32_post_imm_ld1r(float* %bar, float** %ptr) {
+; CHECK-LABEL: test_v4f32_post_imm_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], #4
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <4 x float> undef, float %tmp1, i32 0
+  %tmp3 = insertelement <4 x float> %tmp2, float %tmp1, i32 1
+  %tmp4 = insertelement <4 x float> %tmp3, float %tmp1, i32 2
+  %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 3
+  %tmp6 = getelementptr float* %bar, i64 1
+  store float* %tmp6, float** %ptr
+  ret <4 x float> %tmp5
+}
+
+define <4 x float> @test_v4f32_post_reg_ld1r(float* %bar, float** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4f32_post_reg_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <4 x float> undef, float %tmp1, i32 0
+  %tmp3 = insertelement <4 x float> %tmp2, float %tmp1, i32 1
+  %tmp4 = insertelement <4 x float> %tmp3, float %tmp1, i32 2
+  %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 3
+  %tmp6 = getelementptr float* %bar, i64 %inc
+  store float* %tmp6, float** %ptr
+  ret <4 x float> %tmp5
+}
+
+define <2 x float> @test_v2f32_post_imm_ld1r(float* %bar, float** %ptr) {
+; CHECK-LABEL: test_v2f32_post_imm_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], #4
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <2 x float> undef, float %tmp1, i32 0
+  %tmp3 = insertelement <2 x float> %tmp2, float %tmp1, i32 1
+  %tmp4 = getelementptr float* %bar, i64 1
+  store float* %tmp4, float** %ptr
+  ret <2 x float> %tmp3
+}
+
+define <2 x float> @test_v2f32_post_reg_ld1r(float* %bar, float** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2f32_post_reg_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <2 x float> undef, float %tmp1, i32 0
+  %tmp3 = insertelement <2 x float> %tmp2, float %tmp1, i32 1
+  %tmp4 = getelementptr float* %bar, i64 %inc
+  store float* %tmp4, float** %ptr
+  ret <2 x float> %tmp3
+}
+
+define <2 x double> @test_v2f64_post_imm_ld1r(double* %bar, double** %ptr) {
+; CHECK-LABEL: test_v2f64_post_imm_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], #8
+  %tmp1 = load double* %bar
+  %tmp2 = insertelement <2 x double> undef, double %tmp1, i32 0
+  %tmp3 = insertelement <2 x double> %tmp2, double %tmp1, i32 1
+  %tmp4 = getelementptr double* %bar, i64 1
+  store double* %tmp4, double** %ptr
+  ret <2 x double> %tmp3
+}
+
+define <2 x double> @test_v2f64_post_reg_ld1r(double* %bar, double** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2f64_post_reg_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], x{{[0-9]+}}
+  %tmp1 = load double* %bar
+  %tmp2 = insertelement <2 x double> undef, double %tmp1, i32 0
+  %tmp3 = insertelement <2 x double> %tmp2, double %tmp1, i32 1
+  %tmp4 = getelementptr double* %bar, i64 %inc
+  store double* %tmp4, double** %ptr
+  ret <2 x double> %tmp3
+}
+
+define <16 x i8> @test_v16i8_post_imm_ld1lane(i8* %bar, i8** %ptr, <16 x i8> %A) {
+; CHECK-LABEL: test_v16i8_post_imm_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], #1
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
+  %tmp3 = getelementptr i8* %bar, i64 1
+  store i8* %tmp3, i8** %ptr
+  ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @test_v16i8_post_reg_ld1lane(i8* %bar, i8** %ptr, i64 %inc, <16 x i8> %A) {
+; CHECK-LABEL: test_v16i8_post_reg_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
+  %tmp3 = getelementptr i8* %bar, i64 %inc
+  store i8* %tmp3, i8** %ptr
+  ret <16 x i8> %tmp2
+}
+
+define <8 x i8> @test_v8i8_post_imm_ld1lane(i8* %bar, i8** %ptr, <8 x i8> %A) {
+; CHECK-LABEL: test_v8i8_post_imm_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], #1
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <8 x i8> %A, i8 %tmp1, i32 1
+  %tmp3 = getelementptr i8* %bar, i64 1
+  store i8* %tmp3, i8** %ptr
+  ret <8 x i8> %tmp2
+}
+
+define <8 x i8> @test_v8i8_post_reg_ld1lane(i8* %bar, i8** %ptr, i64 %inc, <8 x i8> %A) {
+; CHECK-LABEL: test_v8i8_post_reg_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <8 x i8> %A, i8 %tmp1, i32 1
+  %tmp3 = getelementptr i8* %bar, i64 %inc
+  store i8* %tmp3, i8** %ptr
+  ret <8 x i8> %tmp2
+}
+
+define <8 x i16> @test_v8i16_post_imm_ld1lane(i16* %bar, i16** %ptr, <8 x i16> %A) {
+; CHECK-LABEL: test_v8i16_post_imm_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], #2
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1
+  %tmp3 = getelementptr i16* %bar, i64 1
+  store i16* %tmp3, i16** %ptr
+  ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @test_v8i16_post_reg_ld1lane(i16* %bar, i16** %ptr, i64 %inc, <8 x i16> %A) {
+; CHECK-LABEL: test_v8i16_post_reg_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1
+  %tmp3 = getelementptr i16* %bar, i64 %inc
+  store i16* %tmp3, i16** %ptr
+  ret <8 x i16> %tmp2
+}
+
+define <4 x i16> @test_v4i16_post_imm_ld1lane(i16* %bar, i16** %ptr, <4 x i16> %A) {
+; CHECK-LABEL: test_v4i16_post_imm_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], #2
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1
+  %tmp3 = getelementptr i16* %bar, i64 1
+  store i16* %tmp3, i16** %ptr
+  ret <4 x i16> %tmp2
+}
+
+define <4 x i16> @test_v4i16_post_reg_ld1lane(i16* %bar, i16** %ptr, i64 %inc, <4 x i16> %A) {
+; CHECK-LABEL: test_v4i16_post_reg_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1
+  %tmp3 = getelementptr i16* %bar, i64 %inc
+  store i16* %tmp3, i16** %ptr
+  ret <4 x i16> %tmp2
+}
+
+define <4 x i32> @test_v4i32_post_imm_ld1lane(i32* %bar, i32** %ptr, <4 x i32> %A) {
+; CHECK-LABEL: test_v4i32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1
+  %tmp3 = getelementptr i32* %bar, i64 1
+  store i32* %tmp3, i32** %ptr
+  ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @test_v4i32_post_reg_ld1lane(i32* %bar, i32** %ptr, i64 %inc, <4 x i32> %A) {
+; CHECK-LABEL: test_v4i32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1
+  %tmp3 = getelementptr i32* %bar, i64 %inc
+  store i32* %tmp3, i32** %ptr
+  ret <4 x i32> %tmp2
+}
+
+define <2 x i32> @test_v2i32_post_imm_ld1lane(i32* %bar, i32** %ptr, <2 x i32> %A) {
+; CHECK-LABEL: test_v2i32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1
+  %tmp3 = getelementptr i32* %bar, i64 1
+  store i32* %tmp3, i32** %ptr
+  ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @test_v2i32_post_reg_ld1lane(i32* %bar, i32** %ptr, i64 %inc, <2 x i32> %A) {
+; CHECK-LABEL: test_v2i32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1
+  %tmp3 = getelementptr i32* %bar, i64 %inc
+  store i32* %tmp3, i32** %ptr
+  ret <2 x i32> %tmp2
+}
+
+define <2 x i64> @test_v2i64_post_imm_ld1lane(i64* %bar, i64** %ptr, <2 x i64> %A) {
+; CHECK-LABEL: test_v2i64_post_imm_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], #8
+  %tmp1 = load i64* %bar
+  %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1
+  %tmp3 = getelementptr i64* %bar, i64 1
+  store i64* %tmp3, i64** %ptr
+  ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @test_v2i64_post_reg_ld1lane(i64* %bar, i64** %ptr, i64 %inc, <2 x i64> %A) {
+; CHECK-LABEL: test_v2i64_post_reg_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load i64* %bar
+  %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1
+  %tmp3 = getelementptr i64* %bar, i64 %inc
+  store i64* %tmp3, i64** %ptr
+  ret <2 x i64> %tmp2
+}
+
+define <4 x float> @test_v4f32_post_imm_ld1lane(float* %bar, float** %ptr, <4 x float> %A) {
+; CHECK-LABEL: test_v4f32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
+  %tmp3 = getelementptr float* %bar, i64 1
+  store float* %tmp3, float** %ptr
+  ret <4 x float> %tmp2
+}
+
+define <4 x float> @test_v4f32_post_reg_ld1lane(float* %bar, float** %ptr, i64 %inc, <4 x float> %A) {
+; CHECK-LABEL: test_v4f32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
+  %tmp3 = getelementptr float* %bar, i64 %inc
+  store float* %tmp3, float** %ptr
+  ret <4 x float> %tmp2
+}
+
+define <2 x float> @test_v2f32_post_imm_ld1lane(float* %bar, float** %ptr, <2 x float> %A) {
+; CHECK-LABEL: test_v2f32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1
+  %tmp3 = getelementptr float* %bar, i64 1
+  store float* %tmp3, float** %ptr
+  ret <2 x float> %tmp2
+}
+
+define <2 x float> @test_v2f32_post_reg_ld1lane(float* %bar, float** %ptr, i64 %inc, <2 x float> %A) {
+; CHECK-LABEL: test_v2f32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1
+  %tmp3 = getelementptr float* %bar, i64 %inc
+  store float* %tmp3, float** %ptr
+  ret <2 x float> %tmp2
+}
+
+define <2 x double> @test_v2f64_post_imm_ld1lane(double* %bar, double** %ptr, <2 x double> %A) {
+; CHECK-LABEL: test_v2f64_post_imm_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], #8
+  %tmp1 = load double* %bar
+  %tmp2 = insertelement <2 x double> %A, double %tmp1, i32 1
+  %tmp3 = getelementptr double* %bar, i64 1
+  store double* %tmp3, double** %ptr
+  ret <2 x double> %tmp2
+}
+
+define <2 x double> @test_v2f64_post_reg_ld1lane(double* %bar, double** %ptr, i64 %inc, <2 x double> %A) {
+; CHECK-LABEL: test_v2f64_post_reg_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], x{{[0-9]+}}
+  %tmp1 = load double* %bar
+  %tmp2 = insertelement <2 x double> %A, double %tmp1, i32 1
+  %tmp3 = getelementptr double* %bar, i64 %inc
+  store double* %tmp3, double** %ptr
+  ret <2 x double> %tmp2
+}
\ No newline at end of file
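The *_post_reg_* tests above exercise the path in performPostLD1Combine where the increment is not a matching constant, so Inc is kept as a register operand instead of being replaced by XZR, and the selected instruction takes the register post-index form checked by x{{[0-9]+}}. A C++ sketch of a source loop that should produce that form, assuming arm_neon.h (illustrative, not part of this patch):

#include <arm_neon.h>
#include <stddef.h>

// The stride is only known at run time, so the post-index increment must
// come from a register, e.g.:
//   ld1r.16b { v0 }, [x0], x2
void splat_strided(const uint8_t *src, ptrdiff_t stride, uint8x16_t *dst,
                   int n) {
  for (int i = 0; i < n; ++i) {
    dst[i] = vld1q_dup_u8(src); // load + splat
    src += stride;              // variable increment -> register post-index
  }
}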