diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -247,10 +247,16 @@ { ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false}, { ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPd16x2, true, false, false, OddDblSpc, 2, 4 ,false}, +{ ARM::VLD2DUPq16OddPseudoWB_fixed, ARM::VLD2DUPd16x2wb_fixed, true, true, false, OddDblSpc, 2, 4 ,false}, +{ ARM::VLD2DUPq16OddPseudoWB_register, ARM::VLD2DUPd16x2wb_register, true, true, true, OddDblSpc, 2, 4 ,false}, { ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false}, { ARM::VLD2DUPq32OddPseudo, ARM::VLD2DUPd32x2, true, false, false, OddDblSpc, 2, 2 ,false}, +{ ARM::VLD2DUPq32OddPseudoWB_fixed, ARM::VLD2DUPd32x2wb_fixed, true, true, false, OddDblSpc, 2, 2 ,false}, +{ ARM::VLD2DUPq32OddPseudoWB_register, ARM::VLD2DUPd32x2wb_register, true, true, true, OddDblSpc, 2, 2 ,false}, { ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPd8x2, true, false, false, EvenDblSpc, 2, 8 ,false}, { ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPd8x2, true, false, false, OddDblSpc, 2, 8 ,false}, +{ ARM::VLD2DUPq8OddPseudoWB_fixed, ARM::VLD2DUPd8x2wb_fixed, true, true, false, OddDblSpc, 2, 8 ,false}, +{ ARM::VLD2DUPq8OddPseudoWB_register, ARM::VLD2DUPd8x2wb_register, true, true, true, OddDblSpc, 2, 8 ,false}, { ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true}, @@ -281,10 +287,13 @@ { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true}, { ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true}, { ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq16, true, false, false, OddDblSpc, 3, 4 ,true}, +{ ARM::VLD3DUPq16OddPseudo_UPD, ARM::VLD3DUPq16_UPD, true, true, true, OddDblSpc, 3, 4 
,true}, { ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true}, { ARM::VLD3DUPq32OddPseudo, ARM::VLD3DUPq32, true, false, false, OddDblSpc, 3, 2 ,true}, +{ ARM::VLD3DUPq32OddPseudo_UPD, ARM::VLD3DUPq32_UPD, true, true, true, OddDblSpc, 3, 2 ,true}, { ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq8, true, false, false, EvenDblSpc, 3, 8 ,true}, { ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq8, true, false, false, OddDblSpc, 3, 8 ,true}, +{ ARM::VLD3DUPq8OddPseudo_UPD, ARM::VLD3DUPq8_UPD, true, true, true, OddDblSpc, 3, 8 ,true}, { ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true}, { ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true}, @@ -322,10 +331,13 @@ { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true}, { ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true}, { ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq16, true, false, false, OddDblSpc, 4, 4 ,true}, +{ ARM::VLD4DUPq16OddPseudo_UPD, ARM::VLD4DUPq16_UPD, true, true, true, OddDblSpc, 4, 4 ,true}, { ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true}, { ARM::VLD4DUPq32OddPseudo, ARM::VLD4DUPq32, true, false, false, OddDblSpc, 4, 2 ,true}, +{ ARM::VLD4DUPq32OddPseudo_UPD, ARM::VLD4DUPq32_UPD, true, true, true, OddDblSpc, 4, 2 ,true}, { ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq8, true, false, false, EvenDblSpc, 4, 8 ,true}, { ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq8, true, false, false, OddDblSpc, 4, 8 ,true}, +{ ARM::VLD4DUPq8OddPseudo_UPD, ARM::VLD4DUPq8_UPD, true, true, true, OddDblSpc, 4, 8 ,true}, { ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true}, { ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true}, @@ -567,9 +579,18 @@ bool DstIsDead = MI.getOperand(OpIdx).isDead(); Register DstReg = MI.getOperand(OpIdx++).getReg(); - if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 
|| - TableEntry->RealOpc == ARM::VLD2DUPd16x2 || - TableEntry->RealOpc == ARM::VLD2DUPd32x2) { + + bool IsVLD2DUP = TableEntry->RealOpc == ARM::VLD2DUPd8x2 || + TableEntry->RealOpc == ARM::VLD2DUPd16x2 || + TableEntry->RealOpc == ARM::VLD2DUPd32x2 || + TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_fixed || + TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_fixed || + TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_fixed || + TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_register || + TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_register || + TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_register; + + if (IsVLD2DUP) { unsigned SubRegIndex; if (RegSpc == EvenDblSpc) { SubRegIndex = ARM::dsub_0; @@ -617,7 +638,10 @@ TableEntry->RealOpc == ARM::VLD1d8Twb_fixed || TableEntry->RealOpc == ARM::VLD1d16Twb_fixed || TableEntry->RealOpc == ARM::VLD1d32Twb_fixed || - TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) { + TableEntry->RealOpc == ARM::VLD1d64Twb_fixed || + TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_fixed || + TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_fixed || + TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_fixed) { assert(AM6Offset.getReg() == 0 && "A fixed writing-back pseudo instruction provides an offset " "register!"); @@ -630,9 +654,7 @@ // has an extra operand that is a use of the super-register. Record the // operand index and skip over it. 
unsigned SrcOpIdx = 0; - if(TableEntry->RealOpc != ARM::VLD2DUPd8x2 && - TableEntry->RealOpc != ARM::VLD2DUPd16x2 && - TableEntry->RealOpc != ARM::VLD2DUPd32x2) { + if (!IsVLD2DUP) { if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc || RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc || RegSpc == SingleHighTSpc) @@ -2697,18 +2719,30 @@ case ARM::VLD2DUPq16OddPseudo: case ARM::VLD2DUPq32EvenPseudo: case ARM::VLD2DUPq32OddPseudo: + case ARM::VLD2DUPq8OddPseudoWB_fixed: + case ARM::VLD2DUPq8OddPseudoWB_register: + case ARM::VLD2DUPq16OddPseudoWB_fixed: + case ARM::VLD2DUPq16OddPseudoWB_register: + case ARM::VLD2DUPq32OddPseudoWB_fixed: + case ARM::VLD2DUPq32OddPseudoWB_register: case ARM::VLD3DUPq8EvenPseudo: case ARM::VLD3DUPq8OddPseudo: case ARM::VLD3DUPq16EvenPseudo: case ARM::VLD3DUPq16OddPseudo: case ARM::VLD3DUPq32EvenPseudo: case ARM::VLD3DUPq32OddPseudo: + case ARM::VLD3DUPq8OddPseudo_UPD: + case ARM::VLD3DUPq16OddPseudo_UPD: + case ARM::VLD3DUPq32OddPseudo_UPD: case ARM::VLD4DUPq8EvenPseudo: case ARM::VLD4DUPq8OddPseudo: case ARM::VLD4DUPq16EvenPseudo: case ARM::VLD4DUPq16OddPseudo: case ARM::VLD4DUPq32EvenPseudo: case ARM::VLD4DUPq32OddPseudo: + case ARM::VLD4DUPq8OddPseudo_UPD: + case ARM::VLD4DUPq16OddPseudo_UPD: + case ARM::VLD4DUPq32OddPseudo_UPD: ExpandVLD(MBBI); return true; diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1972,6 +1972,9 @@ case ARM::VLD2DUPd8wb_fixed : return true; case ARM::VLD2DUPd16wb_fixed : return true; case ARM::VLD2DUPd32wb_fixed : return true; + case ARM::VLD2DUPq8OddPseudoWB_fixed: return true; + case ARM::VLD2DUPq16OddPseudoWB_fixed: return true; + case ARM::VLD2DUPq32OddPseudoWB_fixed: return true; } } @@ -2035,6 +2038,9 @@ case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register; case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register; case ARM::VLD1DUPq32wb_fixed : 
return ARM::VLD1DUPq32wb_register; + case ARM::VLD2DUPq8OddPseudoWB_fixed: return ARM::VLD2DUPq8OddPseudoWB_register; + case ARM::VLD2DUPq16OddPseudoWB_fixed: return ARM::VLD2DUPq16OddPseudoWB_register; + case ARM::VLD2DUPq32OddPseudoWB_fixed: return ARM::VLD2DUPq32OddPseudoWB_register; case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register; case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register; @@ -2987,52 +2993,48 @@ SDValue Pred = getAL(CurDAG, dl); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SDNode *VLdDup; - if (is64BitVector || NumVecs == 1) { - SmallVector Ops; - Ops.push_back(MemAddr); - Ops.push_back(Align); - unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] : - QOpcodes0[OpcodeIndex]; - if (isUpdating) { - // fixed-stride update instructions don't have an explicit writeback - // operand. It's implicit in the opcode itself. - SDValue Inc = N->getOperand(2); - bool IsImmUpdate = - isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs); - if (NumVecs <= 2 && !IsImmUpdate) - Opc = getVLDSTRegisterUpdateOpcode(Opc); - if (!IsImmUpdate) - Ops.push_back(Inc); - // FIXME: VLD3 and VLD4 haven't been updated to that form yet. - else if (NumVecs > 2) + SmallVector Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] + : (NumVecs == 1) ? QOpcodes0[OpcodeIndex] + : QOpcodes1[OpcodeIndex]; + if (isUpdating) { + SDValue Inc = N->getOperand(2); + bool IsImmUpdate = + isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs); + if (IsImmUpdate) { + if (!isVLDfixed(Opc)) Ops.push_back(Reg0); + } else { + if (isVLDfixed(Opc)) + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); } - Ops.push_back(Pred); - Ops.push_back(Reg0); - Ops.push_back(Chain); - VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + } + if (is64BitVector || NumVecs == 1) { + // Double registers and VLD1 quad registers are directly supported. 
} else if (NumVecs == 2) { - const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain }; - SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], - dl, ResTys, OpsA); - + const SDValue OpsA[] = {MemAddr, Align, Pred, Reg0, Chain}; + SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy, + MVT::Other, OpsA); Chain = SDValue(VLdA, 1); - const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain }; - VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB); } else { - SDValue ImplDef = - SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); - const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain }; - SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], - dl, ResTys, OpsA); - - SDValue SuperReg = SDValue(VLdA, 0); + SDValue ImplDef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); + const SDValue OpsA[] = {MemAddr, Align, ImplDef, Pred, Reg0, Chain}; + SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy, + MVT::Other, OpsA); + Ops.push_back(SDValue(VLdA, 0)); Chain = SDValue(VLdA, 1); - const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain }; - VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB); } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + + SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + // Transfer memoperands. 
MachineMemOperand *MemOp = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(VLdDup), {MemOp}); @@ -4192,26 +4194,47 @@ } case ARMISD::VLD2DUP_UPD: { - static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed, - ARM::VLD2DUPd16wb_fixed, - ARM::VLD2DUPd32wb_fixed }; - SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes); + static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8wb_fixed, + ARM::VLD2DUPd16wb_fixed, + ARM::VLD2DUPd32wb_fixed, + ARM::VLD1q64wb_fixed }; + static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo, + ARM::VLD2DUPq16EvenPseudo, + ARM::VLD2DUPq32EvenPseudo }; + static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudoWB_fixed, + ARM::VLD2DUPq16OddPseudoWB_fixed, + ARM::VLD2DUPq32OddPseudoWB_fixed }; + SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, DOpcodes, QOpcodes0, QOpcodes1); return; } case ARMISD::VLD3DUP_UPD: { - static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, - ARM::VLD3DUPd16Pseudo_UPD, - ARM::VLD3DUPd32Pseudo_UPD }; - SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes); + static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, + ARM::VLD3DUPd16Pseudo_UPD, + ARM::VLD3DUPd32Pseudo_UPD, + ARM::VLD1d64TPseudoWB_fixed }; + static const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo, + ARM::VLD3DUPq16EvenPseudo, + ARM::VLD3DUPq32EvenPseudo }; + static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo_UPD, + ARM::VLD3DUPq16OddPseudo_UPD, + ARM::VLD3DUPq32OddPseudo_UPD }; + SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, DOpcodes, QOpcodes0, QOpcodes1); return; } case ARMISD::VLD4DUP_UPD: { - static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, - ARM::VLD4DUPd16Pseudo_UPD, - ARM::VLD4DUPd32Pseudo_UPD }; - SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes); + static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, + ARM::VLD4DUPd16Pseudo_UPD, + ARM::VLD4DUPd32Pseudo_UPD, + ARM::VLD1d64QPseudoWB_fixed }; + static const uint16_t QOpcodes0[] = { 
ARM::VLD4DUPq8EvenPseudo, + ARM::VLD4DUPq16EvenPseudo, + ARM::VLD4DUPq32EvenPseudo }; + static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo_UPD, + ARM::VLD4DUPq16OddPseudo_UPD, + ARM::VLD4DUPq32OddPseudo_UPD }; + SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, DOpcodes, QOpcodes0, QOpcodes1); return; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -14774,12 +14774,12 @@ NumVecs = 3; hasAlignment = false; break; case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD; NumVecs = 4; hasAlignment = false; break; - case Intrinsic::arm_neon_vld2dup: - case Intrinsic::arm_neon_vld3dup: - case Intrinsic::arm_neon_vld4dup: - // TODO: Support updating VLDxDUP nodes. For now, we just skip - // combining base updates for such intrinsics. - continue; + case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD; + NumVecs = 4; break; case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; NumVecs = 2; isLaneOp = true; break; case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; @@ -14833,8 +14833,12 @@ VecTy = N->getOperand(1).getValueType(); } + bool isVLDDUPOp = + NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD || + NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD; + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (isLaneOp) + if (isLaneOp || isVLDDUPOp) NumBytes /= VecTy.getVectorNumElements(); // If the increment is a constant, it must match the memory ref size. 
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -1534,6 +1534,13 @@ defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes, addrmode6dupalign64>; +def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD2]>; +def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD2]>; +def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD2]>; +def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudo, Sched<[WriteVLD2]>; + // VLD3DUP : Vector Load (single 3-element structure to all lanes) class VLD3DUP op7_4, string Dt> : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), @@ -1587,6 +1594,10 @@ def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD3DUPq8OddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD2]>; +def VLD3DUPq16OddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD2]>; +def VLD3DUPq32OddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD2]>; + // VLD4DUP : Vector Load (single 4-element structure to all lanes) class VLD4DUP op7_4, string Dt> : NLdSt<1, 0b10, 0b1111, op7_4, @@ -1641,6 +1652,10 @@ def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD4DUPq8OddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD2]>; +def VLD4DUPq16OddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD2]>; +def VLD4DUPq32OddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD2]>; + } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { diff --git a/llvm/test/CodeGen/ARM/arm-vlddup-update.ll b/llvm/test/CodeGen/ARM/arm-vlddup-update.ll --- 
a/llvm/test/CodeGen/ARM/arm-vlddup-update.ll +++ b/llvm/test/CodeGen/ARM/arm-vlddup-update.ll @@ -1,43 +1,495 @@ ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \ ; RUN: -asm-verbose=false | FileCheck %s +%struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> } +%struct.uint16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } +%struct.uint16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } + %struct.uint32x2x2_t = type { <2 x i32>, <2 x i32> } %struct.uint32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> } %struct.uint32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } +%struct.uint64x1x2_t = type { <1 x i64>, <1 x i64> } +%struct.uint64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> } +%struct.uint64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } + +%struct.uint8x8x2_t = type { <8 x i8>, <8 x i8> } +%struct.uint8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } +%struct.uint8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } + +%struct.uint16x8x2_t = type { <8 x i16>, <8 x i16> } +%struct.uint16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> } +%struct.uint16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } + +%struct.uint32x4x2_t = type { <4 x i32>, <4 x i32> } +%struct.uint32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> } +%struct.uint32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } + +%struct.uint8x16x2_t = type { <16 x i8>, <16 x i8> } +%struct.uint8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> } +%struct.uint8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } + +declare %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0i8(i8*, i32) +declare %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0i8(i8*, i32) declare %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0i8(i8*, i32) +declare %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0i8(i8*, i32) + +declare %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0i8(i8*, i32) +declare %struct.uint16x4x3_t 
@llvm.arm.neon.vld3dup.v4i16.p0i8(i8*, i32) declare %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0i8(i8*, i32) +declare %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0i8(i8*, i32) + +declare %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0i8(i8*, i32) +declare %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0i8(i8*, i32) declare %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0i8(i8*, i32) +declare %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0i8(i8*, i32) + +declare %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0i8(i8*, i32) +declare %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0i8(i8*, i32) +declare %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0i8(i8*, i32) + +declare %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0i8(i8*, i32) +declare %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0i8(i8*, i32) +declare %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0i8(i8*, i32) + +declare %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0i8(i8*, i32) +declare %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0i8(i8*, i32) +declare %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0i8(i8*, i32) + +define i8* @test_vld2_dup_u16_update(%struct.uint16x4x2_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld2_dup_u16_update: +; CHECK: vld2.16 {d16[], d17[]}, [r1]! 
+entry: + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* %src, i32 2) + store %struct.uint16x4x2_t %tmp, %struct.uint16x4x2_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 4 + ret i8* %updated_src +} + +define i8* @test_vld2_dup_u16_update_reg(%struct.uint16x4x2_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld2_dup_u16_update_reg: +; CHECK: vld2.16 {d16[], d17[]}, [r1], r2 +entry: + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* %src, i32 2) + store %struct.uint16x4x2_t %tmp, %struct.uint16x4x2_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} -; CHECK-LABEL: test_vld2_dup_update -; CHECK: vld2.32 {d16[], d17[]}, {{\[}}[[SRC_R:r[0-9]+]]] -; CHECK: add {{r[0-9]+|lr}}, [[SRC_R]], #4 define i8* @test_vld2_dup_update(%struct.uint32x2x2_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld2_dup_update: +; CHECK: vld2.32 {d16[], d17[]}, [r1]! entry: %tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* %src, i32 4) store %struct.uint32x2x2_t %tmp, %struct.uint32x2x2_t* %dest, align 8 - %updated_src = getelementptr inbounds i8, i8* %src, i32 4 + %updated_src = getelementptr inbounds i8, i8* %src, i32 8 ret i8* %updated_src } -; CHECK-LABEL: test_vld3_dup_update -; CHECK: vld3.32 {d16[], d17[], d18[]}, {{\[}}[[SRC_R:r[0-9]+]]] -; CHECK: add {{r[0-9]+|lr}}, [[SRC_R]], #4 -define i8* @test_vld3_dup_update(%struct.uint32x2x3_t* %dest, i8* %src) { +define i8* @test_vld2_dup_update_reg(%struct.uint32x2x2_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld2_dup_update_reg: +; CHECK: vld2.32 {d16[], d17[]}, [r1], r2 +entry: + %tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* %src, i32 4) + store %struct.uint32x2x2_t %tmp, %struct.uint32x2x2_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* 
@test_vld2_dup_u64_update(%struct.uint64x1x2_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld2_dup_u64_update: +; CHECK: vld1.64 {d16, d17}, [r1:64]! +entry: + %tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* %src, i32 8) + store %struct.uint64x1x2_t %tmp, %struct.uint64x1x2_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 16 + ret i8* %updated_src +} + +define i8* @test_vld2_dup_u64_update_reg(%struct.uint64x1x2_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld2_dup_u64_update_reg: +; CHECK: vld1.64 {d16, d17}, [r1:64], r2 +entry: + %tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* %src, i32 8) + store %struct.uint64x1x2_t %tmp, %struct.uint64x1x2_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* @test_vld2_dup_u8_update(%struct.uint8x8x2_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld2_dup_u8_update: +; CHECK: vld2.8 {d16[], d17[]}, [r1]! +entry: + %tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1) + store %struct.uint8x8x2_t %tmp, %struct.uint8x8x2_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 2 + ret i8* %updated_src +} + +define i8* @test_vld2_dup_u8_update_reg(%struct.uint8x8x2_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld2_dup_u8_update_reg: +; CHECK: vld2.8 {d16[], d17[]}, [r1], r2 +entry: + %tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1) + store %struct.uint8x8x2_t %tmp, %struct.uint8x8x2_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* @test_vld3_dup_u16_update(%struct.uint16x4x3_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld3_dup_u16_update: +; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1]! 
+entry: + %tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* %src, i32 2) + store %struct.uint16x4x3_t %tmp, %struct.uint16x4x3_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 6 + ret i8* %updated_src +} + +define i8* @test_vld3_dup_u16_update_reg(%struct.uint16x4x3_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld3_dup_u16_update_reg: +; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1], r2 +entry: + %tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* %src, i32 2) + store %struct.uint16x4x3_t %tmp, %struct.uint16x4x3_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* @test_vld3_dup_u32_update(%struct.uint32x2x3_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld3_dup_u32_update: +; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1]! entry: %tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* %src, i32 4) store %struct.uint32x2x3_t %tmp, %struct.uint32x2x3_t* %dest, align 8 - %updated_src = getelementptr inbounds i8, i8* %src, i32 4 + %updated_src = getelementptr inbounds i8, i8* %src, i32 12 + ret i8* %updated_src +} + +define i8* @test_vld3_dup_u32_update_reg(%struct.uint32x2x3_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld3_dup_u32_update_reg: +; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1], r2 +entry: + %tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* %src, i32 4) + store %struct.uint32x2x3_t %tmp, %struct.uint32x2x3_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* @test_vld3_dup_u64_update(%struct.uint64x1x3_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld3_dup_u64_update: +; CHECK: vld1.64 {d16, d17, d18}, [r1]! 
+entry: + %tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* %src, i32 8) + store %struct.uint64x1x3_t %tmp, %struct.uint64x1x3_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 24 + ret i8* %updated_src +} + +define i8* @test_vld3_dup_u64_update_reg(%struct.uint64x1x3_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld3_dup_u64_update_reg: +; CHECK: vld1.64 {d16, d17, d18}, [r1], r2 +entry: + %tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* %src, i32 8) + store %struct.uint64x1x3_t %tmp, %struct.uint64x1x3_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* @test_vld3_dup_u8_update(%struct.uint8x8x3_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld3_dup_u8_update: +; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1]! +entry: + %tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1) + store %struct.uint8x8x3_t %tmp, %struct.uint8x8x3_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 3 + ret i8* %updated_src +} + +define i8* @test_vld3_dup_u8_update_reg(%struct.uint8x8x3_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld3_dup_u8_update_reg: +; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1], r2 +entry: + %tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1) + store %struct.uint8x8x3_t %tmp, %struct.uint8x8x3_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* @test_vld4_dup_u16_update(%struct.uint16x4x4_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld4_dup_u16_update: +; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]! 
+entry: + %tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* %src, i32 2) + store %struct.uint16x4x4_t %tmp, %struct.uint16x4x4_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 8 ret i8* %updated_src } -; CHECK-LABEL: test_vld4_dup_update -; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, {{\[}}[[SRC_R:r[0-9]+]]] -; CHECK: add {{r[0-9]+|lr}}, [[SRC_R]], #4 -define i8* @test_vld4_dup_update(%struct.uint32x2x4_t* %dest, i8* %src) { +define i8* @test_vld4_dup_u16_update_reg(%struct.uint16x4x4_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld4_dup_u16_update_reg: +; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1], r2 +entry: + %tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* %src, i32 2) + store %struct.uint16x4x4_t %tmp, %struct.uint16x4x4_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* @test_vld4_dup_u32_update(%struct.uint32x2x4_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld4_dup_u32_update: +; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1]! entry: %tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* %src, i32 4) store %struct.uint32x2x4_t %tmp, %struct.uint32x2x4_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 16 + ret i8* %updated_src +} + +define i8* @test_vld4_dup_u32_update_reg(%struct.uint32x2x4_t* %dest, i8* %src, i32 %inc) { +; CHECK-LABEL: test_vld4_dup_u32_update_reg: +; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1], r2 +entry: + %tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* %src, i32 4) + store %struct.uint32x2x4_t %tmp, %struct.uint32x2x4_t* %dest, align 8 + %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc + ret i8* %updated_src +} + +define i8* @test_vld4_dup_u64_update(%struct.uint64x1x4_t* %dest, i8* %src) { +; CHECK-LABEL: test_vld4_dup_u64_update: +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64]! 
+entry:
+  %tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* %src, i32 8)
+  store %struct.uint64x1x4_t %tmp, %struct.uint64x1x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 32
+  ret i8* %updated_src
+}
+
+; vld4dup of four <1 x i64> values, then the source pointer is advanced by the
+; run-time amount %inc and returned. The expected output is one vld1.64 of
+; four d-registers whose address operand carries a register post-increment
+; ("[r1:64], r2"; presumably r1 = %src and r2 = %inc -- confirm against the
+; target's calling convention).
+define i8* @test_vld4_dup_u64_update_reg(%struct.uint64x1x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_u64_update_reg:
+; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64], r2
+entry:
+  %tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* %src, i32 8)
+  store %struct.uint64x1x4_t %tmp, %struct.uint64x1x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld4dup of four <8 x i8> values, then the source pointer is advanced by the
+; constant 4 (4 x i8) and returned. The expected output is one all-lanes
+; vld4.8 using the "!" writeback form.
+define i8* @test_vld4_dup_u8_update(%struct.uint8x8x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4_dup_u8_update:
+; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
+  store %struct.uint8x8x4_t %tmp, %struct.uint8x8x4_t* %dest, align 8
   %updated_src = getelementptr inbounds i8, i8* %src, i32 4
   ret i8* %updated_src
 }
+
+; Same load as test_vld4_dup_u8_update, but the pointer is advanced by the
+; run-time amount %inc; the expected vld4.8 uses the register post-increment
+; form ("[r1], r2").
+define i8* @test_vld4_dup_u8_update_reg(%struct.uint8x8x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_u8_update_reg:
+; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
+  store %struct.uint8x8x4_t %tmp, %struct.uint8x8x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; The remaining tests cover the 128-bit-vector dup loads. Each one expects a
+; pair of all-lanes loads from the same base register: the first targets the
+; even-numbered d-registers with no writeback, the second targets the
+; odd-numbered d-registers and carries the post-increment ("!" for the
+; constant amount, a trailing register for %inc).
+;
+; vld2dup of two <8 x i16> values; pointer advanced by the constant 4
+; (2 x i16). The second load must directly follow the first.
+; NOTE(review): the third parameter %dest0 is never referenced in the body and
+; no sibling test has one -- confirm whether it is intentional.
+define i8* @test_vld2q_dup_u16_update(%struct.uint16x8x2_t* %dest, i8* %src, <8 x i16>* %dest0) {
+; CHECK-LABEL: test_vld2q_dup_u16_update:
+; CHECK: vld2.16 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.16 {d17[], d19[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* %src, i32 2)
+  store %struct.uint16x8x2_t %tmp, %struct.uint16x8x2_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 4
+  ret i8* %updated_src
+}
+
+; vld2dup of two <8 x i16> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld2q_dup_u16_update_reg(%struct.uint16x8x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_u16_update_reg:
+; CHECK: vld2.16 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.16 {d17[], d19[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* %src, i32 2)
+  store %struct.uint16x8x2_t %tmp, %struct.uint16x8x2_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld2dup of two <4 x i32> values; pointer advanced by the constant 8
+; (2 x i32). The second load must directly follow the first and uses "!".
+define i8* @test_vld2q_dup_u32_update(%struct.uint32x4x2_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld2q_dup_u32_update:
+; CHECK: vld2.32 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.32 {d17[], d19[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* %src, i32 4)
+  store %struct.uint32x4x2_t %tmp, %struct.uint32x4x2_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 8
+  ret i8* %updated_src
+}
+
+; vld2dup of two <4 x i32> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld2q_dup_u32_update_reg(%struct.uint32x4x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_u32_update_reg:
+; CHECK: vld2.32 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.32 {d17[], d19[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* %src, i32 4)
+  store %struct.uint32x4x2_t %tmp, %struct.uint32x4x2_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld2dup of two <16 x i8> values; pointer advanced by the constant 2
+; (2 x i8). The second load must directly follow the first and uses "!".
+define i8* @test_vld2q_dup_u8_update(%struct.uint8x16x2_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld2q_dup_u8_update:
+; CHECK: vld2.8 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.8 {d17[], d19[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
+  store %struct.uint8x16x2_t %tmp, %struct.uint8x16x2_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 2
+  ret i8* %updated_src
+}
+
+; vld2dup of two <16 x i8> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld2q_dup_u8_update_reg(%struct.uint8x16x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_u8_update_reg:
+; CHECK: vld2.8 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.8 {d17[], d19[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
+  store %struct.uint8x16x2_t %tmp, %struct.uint8x16x2_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld3dup of three <8 x i16> values; pointer advanced by the constant 6
+; (3 x i16). The second load uses "!".
+; NOTE(review): the second load is matched with a plain directive here, so it
+; need not be on the very next output line, whereas the _reg variant below
+; uses the -NEXT form -- confirm the looser match is intentional.
+define i8* @test_vld3q_dup_u16_update(%struct.uint16x8x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3q_dup_u16_update:
+; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.16 {d17[], d19[], d21[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* %src, i32 2)
+  store %struct.uint16x8x3_t %tmp, %struct.uint16x8x3_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 6
+  ret i8* %updated_src
+}
+
+; vld3dup of three <8 x i16> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld3q_dup_u16_update_reg(%struct.uint16x8x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3q_dup_u16_update_reg:
+; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
+; CHECK-NEXT: vld3.16 {d17[], d19[], d21[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* %src, i32 2)
+  store %struct.uint16x8x3_t %tmp, %struct.uint16x8x3_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld3dup of three <4 x i32> values; pointer advanced by the constant 12
+; (3 x i32). The second load uses "!" and is matched with a plain directive
+; (not required to be on the very next output line).
+define i8* @test_vld3q_dup_u32_update(%struct.uint32x4x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3q_dup_u32_update:
+; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.32 {d17[], d19[], d21[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* %src, i32 4)
+  store %struct.uint32x4x3_t %tmp, %struct.uint32x4x3_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 12
+  ret i8* %updated_src
+}
+
+; vld3dup of three <4 x i32> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld3q_dup_u32_update_reg(%struct.uint32x4x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3q_dup_u32_update_reg:
+; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
+; CHECK-NEXT: vld3.32 {d17[], d19[], d21[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* %src, i32 4)
+  store %struct.uint32x4x3_t %tmp, %struct.uint32x4x3_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld3dup of three <16 x i8> values; pointer advanced by the constant 3
+; (3 x i8). The second load uses "!" and is matched with a plain directive
+; (not required to be on the very next output line).
+define i8* @test_vld3q_dup_u8_update(%struct.uint8x16x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3q_dup_u8_update:
+; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.8 {d17[], d19[], d21[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
+  store %struct.uint8x16x3_t %tmp, %struct.uint8x16x3_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 3
+  ret i8* %updated_src
+}
+
+; vld3dup of three <16 x i8> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld3q_dup_u8_update_reg(%struct.uint8x16x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3q_dup_u8_update_reg:
+; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
+; CHECK-NEXT: vld3.8 {d17[], d19[], d21[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
+  store %struct.uint8x16x3_t %tmp, %struct.uint8x16x3_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld4dup of four <8 x i16> values; pointer advanced by the constant 8
+; (4 x i16). The second load uses "!" and is matched with a plain directive
+; (not required to be on the very next output line).
+define i8* @test_vld4q_dup_u16_update(%struct.uint16x8x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4q_dup_u16_update:
+; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* %src, i32 2)
+  store %struct.uint16x8x4_t %tmp, %struct.uint16x8x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 8
+  ret i8* %updated_src
+}
+
+; vld4dup of four <8 x i16> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld4q_dup_u16_update_reg(%struct.uint16x8x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4q_dup_u16_update_reg:
+; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK-NEXT: vld4.16 {d17[], d19[], d21[], d23[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* %src, i32 2)
+  store %struct.uint16x8x4_t %tmp, %struct.uint16x8x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld4dup of four <4 x i32> values; pointer advanced by the constant 16
+; (4 x i32). The second load uses "!" and is matched with a plain directive
+; (not required to be on the very next output line).
+define i8* @test_vld4q_dup_u32_update(%struct.uint32x4x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4q_dup_u32_update:
+; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* %src, i32 4)
+  store %struct.uint32x4x4_t %tmp, %struct.uint32x4x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 16
+  ret i8* %updated_src
+}
+
+; vld4dup of four <4 x i32> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld4q_dup_u32_update_reg(%struct.uint32x4x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4q_dup_u32_update_reg:
+; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK-NEXT: vld4.32 {d17[], d19[], d21[], d23[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* %src, i32 4)
+  store %struct.uint32x4x4_t %tmp, %struct.uint32x4x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}
+
+; vld4dup of four <16 x i8> values; pointer advanced by the constant 4
+; (4 x i8). The second load uses "!" and is matched with a plain directive
+; (not required to be on the very next output line).
+define i8* @test_vld4q_dup_u8_update(%struct.uint8x16x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4q_dup_u8_update:
+; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]!
+entry:
+  %tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
+  store %struct.uint8x16x4_t %tmp, %struct.uint8x16x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 4
+  ret i8* %updated_src
+}
+
+; vld4dup of four <16 x i8> values; pointer advanced by the run-time amount
+; %inc. The second load must directly follow the first and uses the register
+; post-increment form.
+define i8* @test_vld4q_dup_u8_update_reg(%struct.uint8x16x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4q_dup_u8_update_reg:
+; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK-NEXT: vld4.8 {d17[], d19[], d21[], d23[]}, [r1], r2
+entry:
+  %tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
+  store %struct.uint8x16x4_t %tmp, %struct.uint8x16x4_t* %dest, align 8
+  %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+  ret i8* %updated_src
+}