Index: llvm/trunk/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsARM.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsARM.td
@@ -652,6 +652,20 @@
                        LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
                       [IntrReadMem, IntrArgMemOnly]>;
 
+// Vector load N-element structure to all lanes.
+// Source operands are the address and alignment.
+def int_arm_neon_vld2dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                                     [llvm_anyptr_ty, llvm_i32_ty],
+                                     [IntrReadMem, IntrArgMemOnly]>;
+def int_arm_neon_vld3dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                                      LLVMMatchType<0>],
+                                     [llvm_anyptr_ty, llvm_i32_ty],
+                                     [IntrReadMem, IntrArgMemOnly]>;
+def int_arm_neon_vld4dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                                      LLVMMatchType<0>, LLVMMatchType<0>],
+                                     [llvm_anyptr_ty, llvm_i32_ty],
+                                     [IntrReadMem, IntrArgMemOnly]>;
+
 // Interleaving vector stores from N-element structures.
 // Source operands are: the address, the N vectors, and the alignment.
 def int_arm_neon_vst1 : Intrinsic<[],
Index: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4310,12 +4310,30 @@
   case ARM::VLD2DUPd8wb_register:
   case ARM::VLD2DUPd16wb_register:
   case ARM::VLD2DUPd32wb_register:
+  case ARM::VLD2DUPq8EvenPseudo:
+  case ARM::VLD2DUPq8OddPseudo:
+  case ARM::VLD2DUPq16EvenPseudo:
+  case ARM::VLD2DUPq16OddPseudo:
+  case ARM::VLD2DUPq32EvenPseudo:
+  case ARM::VLD2DUPq32OddPseudo:
+  case ARM::VLD3DUPq8EvenPseudo:
+  case ARM::VLD3DUPq8OddPseudo:
+  case ARM::VLD3DUPq16EvenPseudo:
+  case ARM::VLD3DUPq16OddPseudo:
+  case ARM::VLD3DUPq32EvenPseudo:
+  case ARM::VLD3DUPq32OddPseudo:
   case ARM::VLD4DUPd8Pseudo:
   case ARM::VLD4DUPd16Pseudo:
   case ARM::VLD4DUPd32Pseudo:
   case ARM::VLD4DUPd8Pseudo_UPD:
   case ARM::VLD4DUPd16Pseudo_UPD:
   case ARM::VLD4DUPd32Pseudo_UPD:
+  case ARM::VLD4DUPq8EvenPseudo:
+  case ARM::VLD4DUPq8OddPseudo:
+  case ARM::VLD4DUPq16EvenPseudo:
+  case ARM::VLD4DUPq16OddPseudo:
+  case ARM::VLD4DUPq32EvenPseudo:
+  case ARM::VLD4DUPq32OddPseudo:
   case ARM::VLD1LNq8Pseudo:
   case ARM::VLD1LNq16Pseudo:
   case ARM::VLD1LNq32Pseudo:
Index: llvm/trunk/lib/Target/ARM/ARMExpandPseudoInsts.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ llvm/trunk/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -186,6 +186,13 @@
 { ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false},
 { ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false},
 
+{ ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq16OddPseudo,  ARM::VLD2DUPd16x2, true, false, false, OddDblSpc,  2, 4 ,false},
+{ ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq32OddPseudo,  ARM::VLD2DUPd32x2, true, false, false, OddDblSpc,  2, 2 ,false},
+{ ARM::VLD2DUPq8EvenPseudo,  ARM::VLD2DUPd8x2,  true, false, false, EvenDblSpc, 2, 8 ,false},
+{ ARM::VLD2DUPq8OddPseudo,   ARM::VLD2DUPd8x2,  true, false, false, OddDblSpc,  2, 8 ,false},
+
 { ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, false, SingleSpc, 2, 4 ,true},
 { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true,  true,  SingleSpc, 2, 4 ,true},
 { ARM::VLD2LNd32Pseudo,     ARM::VLD2LNd32,     true, false, false, SingleSpc, 2, 2 ,true},
@@ -213,6 +220,12 @@
 { ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true,  true,  SingleSpc, 3, 2,true},
 { ARM::VLD3DUPd8Pseudo,      ARM::VLD3DUPd8,      true, false, false, SingleSpc, 3, 8,true},
 { ARM::VLD3DUPd8Pseudo_UPD,  ARM::VLD3DUPd8_UPD,  true, true,  true,  SingleSpc, 3, 8,true},
+{ ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3DUPq16OddPseudo,  ARM::VLD3DUPq16, true, false, false, OddDblSpc,  3, 4 ,true},
+{ ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3DUPq32OddPseudo,  ARM::VLD3DUPq32, true, false, false, OddDblSpc,  3, 2 ,true},
+{ ARM::VLD3DUPq8EvenPseudo,  ARM::VLD3DUPq8,  true, false, false, EvenDblSpc, 3, 8 ,true},
+{ ARM::VLD3DUPq8OddPseudo,   ARM::VLD3DUPq8,  true, false, false, OddDblSpc,  3, 8 ,true},
 
 { ARM::VLD3LNd16Pseudo,     ARM::VLD3LNd16,     true, false, false, SingleSpc, 3, 4 ,true},
 { ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true,  true,  SingleSpc, 3, 4 ,true},
@@ -248,6 +261,12 @@
 { ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true,  true,  SingleSpc, 4, 2,true},
 { ARM::VLD4DUPd8Pseudo,      ARM::VLD4DUPd8,      true, false, false, SingleSpc, 4, 8,true},
 { ARM::VLD4DUPd8Pseudo_UPD,  ARM::VLD4DUPd8_UPD,  true, true,  true,  SingleSpc, 4, 8,true},
+{ ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4DUPq16OddPseudo,  ARM::VLD4DUPq16, true, false, false, OddDblSpc,  4, 4 ,true},
+{ ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4DUPq32OddPseudo,  ARM::VLD4DUPq32, true, false, false, OddDblSpc,  4, 2 ,true},
+{ ARM::VLD4DUPq8EvenPseudo,  ARM::VLD4DUPq8,  true, false, false, EvenDblSpc, 4, 8 ,true},
+{ ARM::VLD4DUPq8OddPseudo,   ARM::VLD4DUPq8,  true, false, false, OddDblSpc,  4, 8 ,true},
 
 { ARM::VLD4LNd16Pseudo,     ARM::VLD4LNd16,     true, false, false, SingleSpc, 4, 4 ,true},
 { ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true,  true,  SingleSpc, 4, 4 ,true},
@@ -463,15 +482,31 @@
   bool DstIsDead = MI.getOperand(OpIdx).isDead();
   unsigned DstReg = MI.getOperand(OpIdx++).getReg();
-  unsigned D0, D1, D2, D3;
-  GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
-  MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
-  if (NumRegs > 1 && TableEntry->copyAllListRegs)
-    MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
-  if (NumRegs > 2 && TableEntry->copyAllListRegs)
-    MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
-  if (NumRegs > 3 && TableEntry->copyAllListRegs)
-    MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+
+  if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
+     TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
+     TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
+    unsigned SubRegIndex;
+    if (RegSpc == EvenDblSpc) {
+      SubRegIndex = ARM::dsub_0;
+    } else {
+      assert(RegSpc == OddDblSpc && "Unexpected spacing!");
+      SubRegIndex = ARM::dsub_1;
+    }
+    unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex);
+    unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0,
+                                                   &ARM::DPairSpcRegClass);
+    MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead));
+  } else {
+    unsigned D0, D1, D2, D3;
+    GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+    MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 1 && TableEntry->copyAllListRegs)
+      MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 2 && TableEntry->copyAllListRegs)
+      MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 3 && TableEntry->copyAllListRegs)
+      MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+  }
 
   if (TableEntry->isUpdating)
     MIB.add(MI.getOperand(OpIdx++));
@@ -510,10 +545,14 @@
   // has an extra operand that is a use of the super-register.  Record the
   // operand index and skip over it.
   unsigned SrcOpIdx = 0;
-  if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
-      RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
-      RegSpc == SingleHighTSpc)
-    SrcOpIdx = OpIdx++;
+  if(TableEntry->RealOpc != ARM::VLD2DUPd8x2 &&
+     TableEntry->RealOpc != ARM::VLD2DUPd16x2 &&
+     TableEntry->RealOpc != ARM::VLD2DUPd32x2) {
+    if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
+        RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
+        RegSpc == SingleHighTSpc)
+      SrcOpIdx = OpIdx++;
+  }
 
   // Copy the predicate operands.
   MIB.add(MI.getOperand(OpIdx++));
@@ -1674,6 +1713,24 @@
   case ARM::VLD4DUPd8Pseudo_UPD:
   case ARM::VLD4DUPd16Pseudo_UPD:
   case ARM::VLD4DUPd32Pseudo_UPD:
+  case ARM::VLD2DUPq8EvenPseudo:
+  case ARM::VLD2DUPq8OddPseudo:
+  case ARM::VLD2DUPq16EvenPseudo:
+  case ARM::VLD2DUPq16OddPseudo:
+  case ARM::VLD2DUPq32EvenPseudo:
+  case ARM::VLD2DUPq32OddPseudo:
+  case ARM::VLD3DUPq8EvenPseudo:
+  case ARM::VLD3DUPq8OddPseudo:
+  case ARM::VLD3DUPq16EvenPseudo:
+  case ARM::VLD3DUPq16OddPseudo:
+  case ARM::VLD3DUPq32EvenPseudo:
+  case ARM::VLD3DUPq32OddPseudo:
+  case ARM::VLD4DUPq8EvenPseudo:
+  case ARM::VLD4DUPq8OddPseudo:
+  case ARM::VLD4DUPq16EvenPseudo:
+  case ARM::VLD4DUPq16OddPseudo:
+  case ARM::VLD4DUPq32EvenPseudo:
+  case ARM::VLD4DUPq32OddPseudo:
     ExpandVLD(MBBI);
     return true;
Index: llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -203,10 +203,11 @@
   /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
   /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
-  /// for loading D registers.  (Q registers are not supported.)
-  void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *DOpcodes,
-                    const uint16_t *QOpcodes = nullptr);
+  /// for loading D registers.
+  void SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating,
+                    unsigned NumVecs, const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes0 = nullptr,
+                    const uint16_t *QOpcodes1 = nullptr);
 
   /// Try to select SBFX/UBFX instructions for ARM.
   bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
@@ -1747,7 +1748,9 @@
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+                                  // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
@@ -1883,7 +1886,9 @@
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+                                  // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
@@ -2033,7 +2038,9 @@
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+                                  // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
@@ -2149,21 +2156,22 @@
   CurDAG->RemoveDeadNode(N);
 }
 
-void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
+                                   bool isUpdating, unsigned NumVecs,
                                    const uint16_t *DOpcodes,
-                                   const uint16_t *QOpcodes) {
+                                   const uint16_t *QOpcodes0,
+                                   const uint16_t *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
-  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
 
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-
   SDValue Chain = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  bool is64BitVector = VT.is64BitVector();
 
   unsigned Alignment = 0;
   if (NumVecs != 3) {
@@ -2180,49 +2188,84 @@
   }
   Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
 
-  unsigned Opc;
+  unsigned OpcodeIndex;
   switch (VT.getSimpleVT().SimpleTy) {
   default: llvm_unreachable("unhandled vld-dup type");
-  case MVT::v8i8:  Opc = DOpcodes[0]; break;
-  case MVT::v16i8: Opc = QOpcodes[0]; break;
-  case MVT::v4i16: Opc = DOpcodes[1]; break;
-  case MVT::v8i16: Opc = QOpcodes[1]; break;
+  case MVT::v8i8:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v4i16:
+  case MVT::v8i16: OpcodeIndex = 1; break;
   case MVT::v2f32:
-  case MVT::v2i32: Opc = DOpcodes[2]; break;
+  case MVT::v2i32:
   case MVT::v4f32:
-  case MVT::v4i32: Opc = QOpcodes[2]; break;
-  }
-
-  SDValue Pred = getAL(CurDAG, dl);
-  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(MemAddr);
-  Ops.push_back(Align);
-  if (isUpdating) {
-    // fixed-stride update instructions don't have an explicit writeback
-    // operand. It's implicit in the opcode itself.
-    SDValue Inc = N->getOperand(2);
-    bool IsImmUpdate =
-        isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
-    if (NumVecs <= 2 && !IsImmUpdate)
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    if (!IsImmUpdate)
-      Ops.push_back(Inc);
-    // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
-    else if (NumVecs > 2)
-      Ops.push_back(Reg0);
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v1f64:
+  case MVT::v1i64: OpcodeIndex = 3; break;
   }
-  Ops.push_back(Pred);
-  Ops.push_back(Reg0);
-  Ops.push_back(Chain);
 
   unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+  if (!is64BitVector)
+    ResTyElts *= 2;
+  EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+
   std::vector<EVT> ResTys;
-  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+  ResTys.push_back(ResTy);
   if (isUpdating)
     ResTys.push_back(MVT::i32);
   ResTys.push_back(MVT::Other);
 
-  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  SDValue Pred = getAL(CurDAG, dl);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+  SDNode *VLdDup;
+  if (is64BitVector || NumVecs == 1) {
+    SmallVector<SDValue, 6> Ops;
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] :
+                                   QOpcodes0[OpcodeIndex];
+    if (isUpdating) {
+      // fixed-stride update instructions don't have an explicit writeback
+      // operand. It's implicit in the opcode itself.
+      SDValue Inc = N->getOperand(2);
+      bool IsImmUpdate =
+          isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+      if (NumVecs <= 2 && !IsImmUpdate)
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      if (!IsImmUpdate)
+        Ops.push_back(Inc);
+      // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+      else if (NumVecs > 2)
+        Ops.push_back(Reg0);
+    }
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  } else if (NumVecs == 2) {
+    const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+                                          dl, ResTys, OpsA);
+
+    Chain = SDValue(VLdA, 1);
+    const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain };
+    VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+  } else {
+    SDValue ImplDef =
+      SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+    const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+                                          dl, ResTys, OpsA);
+
+    SDValue SuperReg = SDValue(VLdA, 0);
+    Chain = SDValue(VLdA, 1);
+    const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain };
+    VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+  }
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
 
   // Extract the subregisters.
@@ -2231,10 +2274,11 @@
   } else {
     SDValue SuperReg = SDValue(VLdDup, 0);
     static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
-    unsigned SubIdx = ARM::dsub_0;
-    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+    for (unsigned Vec = 0; Vec != NumVecs; ++Vec) {
       ReplaceUses(SDValue(N, Vec),
                   CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+    }
   }
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
   if (isUpdating)
@@ -3066,14 +3110,14 @@
                                          ARM::VLD1DUPd32 };
     static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
                                          ARM::VLD1DUPq32 };
-    SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 1, DOpcodes, QOpcodes);
     return;
   }
 
   case ARMISD::VLD2DUP: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
                                         ARM::VLD2DUPd32 };
-    SelectVLDDup(N, false, 2, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 2, Opcodes);
     return;
   }
 
@@ -3081,7 +3125,7 @@
     static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
                                         ARM::VLD3DUPd16Pseudo,
                                         ARM::VLD3DUPd32Pseudo };
-    SelectVLDDup(N, false, 3, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 3, Opcodes);
     return;
   }
 
@@ -3089,7 +3133,7 @@
     static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
                                         ARM::VLD4DUPd16Pseudo,
                                         ARM::VLD4DUPd32Pseudo };
-    SelectVLDDup(N, false, 4, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 4, Opcodes);
     return;
   }
 
@@ -3100,7 +3144,7 @@
     static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
                                          ARM::VLD1DUPq16wb_fixed,
                                          ARM::VLD1DUPq32wb_fixed };
-    SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 1, DOpcodes, QOpcodes);
     return;
   }
 
@@ -3108,7 +3152,7 @@
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
                                         ARM::VLD2DUPd16wb_fixed,
                                         ARM::VLD2DUPd32wb_fixed };
-    SelectVLDDup(N, true, 2, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes);
     return;
   }
 
@@ -3116,7 +3160,7 @@
     static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
                                         ARM::VLD3DUPd16Pseudo_UPD,
                                         ARM::VLD3DUPd32Pseudo_UPD };
-    SelectVLDDup(N, true, 3, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes);
     return;
   }
 
@@ -3124,7 +3168,7 @@
     static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
                                         ARM::VLD4DUPd16Pseudo_UPD,
                                         ARM::VLD4DUPd32Pseudo_UPD };
-    SelectVLDDup(N, true, 4, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes);
     return;
   }
 
@@ -3531,6 +3575,52 @@
     return;
   }
 
+  case Intrinsic::arm_neon_vld2dup: {
+    static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+                                         ARM::VLD2DUPd32, ARM::VLD1q64 };
+    static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo,
+                                          ARM::VLD2DUPq16EvenPseudo,
+                                          ARM::VLD2DUPq32EvenPseudo };
+    static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudo,
+                                          ARM::VLD2DUPq16OddPseudo,
+                                          ARM::VLD2DUPq32OddPseudo };
+    SelectVLDDup(N, /* IsIntrinsic= */ true, false, 2,
+                 DOpcodes, QOpcodes0, QOpcodes1);
+    return;
+  }
+
+  case Intrinsic::arm_neon_vld3dup: {
+    static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo,
+                                         ARM::VLD3DUPd16Pseudo,
+                                         ARM::VLD3DUPd32Pseudo,
+                                         ARM::VLD1d64TPseudo };
+    static const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo,
+                                          ARM::VLD3DUPq16EvenPseudo,
+                                          ARM::VLD3DUPq32EvenPseudo };
+    static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo,
+                                          ARM::VLD3DUPq16OddPseudo,
+                                          ARM::VLD3DUPq32OddPseudo };
+    SelectVLDDup(N, /* IsIntrinsic= */ true, false, 3,
+                 DOpcodes, QOpcodes0, QOpcodes1);
+    return;
+  }
+
+  case Intrinsic::arm_neon_vld4dup: {
+    static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo,
+                                         ARM::VLD4DUPd16Pseudo,
+                                         ARM::VLD4DUPd32Pseudo,
+                                         ARM::VLD1d64QPseudo };
+    static const uint16_t QOpcodes0[] = { ARM::VLD4DUPq8EvenPseudo,
+                                          ARM::VLD4DUPq16EvenPseudo,
+                                          ARM::VLD4DUPq32EvenPseudo };
+    static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo,
+                                          ARM::VLD4DUPq16OddPseudo,
+                                          ARM::VLD4DUPq32OddPseudo };
+    SelectVLDDup(N, /* IsIntrinsic= */ true, false, 4,
+                 DOpcodes, QOpcodes0, QOpcodes1);
+    return;
+  }
+
   case Intrinsic::arm_neon_vld2lane: {
     static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
                                          ARM::VLD2LNd16Pseudo,
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -12772,6 +12772,9 @@
   case Intrinsic::arm_neon_vld2lane:
   case Intrinsic::arm_neon_vld3lane:
   case Intrinsic::arm_neon_vld4lane:
+  case Intrinsic::arm_neon_vld2dup:
+  case Intrinsic::arm_neon_vld3dup:
+  case Intrinsic::arm_neon_vld4dup:
   case Intrinsic::arm_neon_vst1:
   case Intrinsic::arm_neon_vst1x2:
   case Intrinsic::arm_neon_vst1x3:
@@ -14066,7 +14069,10 @@
   case Intrinsic::arm_neon_vld4:
   case Intrinsic::arm_neon_vld2lane:
   case Intrinsic::arm_neon_vld3lane:
-  case Intrinsic::arm_neon_vld4lane: {
+  case Intrinsic::arm_neon_vld4lane:
+  case Intrinsic::arm_neon_vld2dup:
+  case Intrinsic::arm_neon_vld3dup:
+  case Intrinsic::arm_neon_vld4dup: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
Index: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
@@ -209,7 +209,7 @@
   let ParserMethod = "parseVectorList";
   let RenderMethod = "addVecListOperands";
 }
-def VecListDPairSpacedAllLanes : RegisterOperand<DPR,
+def VecListDPairSpacedAllLanes : RegisterOperand<DPairSpc,
                                                  "printVectorListTwoSpacedAllLanes"> {
   let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand;
 }
@@ -1518,6 +1518,13 @@
 def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
                            addrmode6dupalign64>;
 
+def VLD2DUPq8EvenPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudo   : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+
 // ...with address register writeback:
 multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
                      Operand AddrMode> {
@@ -1578,6 +1585,13 @@
 def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
 def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
 
+def VLD3DUPq8EvenPseudo  : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq8OddPseudo   : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq16EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq16OddPseudo  : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq32EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq32OddPseudo  : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+
 // ...with address register writeback:
 class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<1, 0b10, 0b1110, op7_4,
           (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
@@ -1624,6 +1638,13 @@
 def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">;
 def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
 
+def VLD4DUPq8EvenPseudo  : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq8OddPseudo   : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq16EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq16OddPseudo  : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq32EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq32OddPseudo  : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+
 // ...with address register writeback:
 class VLD4DUPWB<bits<4> op7_4, string Dt>
   : NLdSt<1, 0b10, 0b1111, op7_4,
Index: llvm/trunk/test/CodeGen/ARM/arm-vlddup.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/arm-vlddup.ll
+++ llvm/trunk/test/CodeGen/ARM/arm-vlddup.ll
@@ -0,0 +1,234 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \
+; RUN:     -asm-verbose=false | FileCheck %s
+
+%struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> }
+%struct.uint16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.uint16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+
+%struct.uint32x2x2_t = type { <2 x i32>, <2 x i32> }
+%struct.uint32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.uint32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+
+%struct.uint64x1x2_t = type { <1 x i64>, <1 x i64> }
+%struct.uint64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> }
+%struct.uint64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
+
+%struct.uint8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.uint8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.uint8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+
+%struct.uint16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.uint16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.uint16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
+
+%struct.uint32x4x2_t = type { <4 x i32>, <4 x i32> }
+%struct.uint32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.uint32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
+
+%struct.uint8x16x2_t = type { <16 x i8>, <16 x i8> }
+%struct.uint8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.uint8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }
+
+declare %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0i8(i8*, i32)
+declare %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0i8(i8*, i32)
+declare %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0i8(i8*, i32)
+declare %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0i8(i8*, i32)
+
+declare %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0i8(i8*, i32)
+declare %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0i8(i8*, i32)
+declare %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0i8(i8*, i32)
+declare %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0i8(i8*, i32)
+
+declare %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0i8(i8*, i32)
+declare %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0i8(i8*, i32)
+declare %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0i8(i8*, i32)
+declare %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0i8(i8*, i32)
+
+declare %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0i8(i8*, i32)
+declare %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0i8(i8*, i32)
+declare %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0i8(i8*, i32)
+
+declare %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0i8(i8*, i32)
+declare %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0i8(i8*, i32)
+declare %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0i8(i8*, i32)
+
+declare %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0i8(i8*, i32)
+declare %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0i8(i8*, i32)
+declare %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0i8(i8*, i32)
+
+; CHECK-LABEL: test_vld2_dup_u16
+; CHECK: vld2.16 {d16[], d17[]}, [r0]
+define %struct.uint16x4x2_t @test_vld2_dup_u16(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* %src, i32 2)
+  ret %struct.uint16x4x2_t %tmp
+}
+
+; CHECK-LABEL: test_vld2_dup_u32
+; CHECK: vld2.32 {d16[], d17[]}, [r0]
+define %struct.uint32x2x2_t @test_vld2_dup_u32(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* %src, i32 4)
+  ret %struct.uint32x2x2_t %tmp
+}
+
+; CHECK-LABEL: test_vld2_dup_u64
+; CHECK: vld1.64 {d16, d17}, [r0:64]
+define %struct.uint64x1x2_t @test_vld2_dup_u64(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* %src, i32 8)
+  ret %struct.uint64x1x2_t %tmp
+}
+
+; CHECK-LABEL: test_vld2_dup_u8
+; CHECK: vld2.8 {d16[], d17[]}, [r0]
+define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
+  ret %struct.uint8x8x2_t %tmp
+}
+
+; CHECK-LABEL: test_vld3_dup_u16
+; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1]
+define %struct.uint16x4x3_t @test_vld3_dup_u16(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* %src, i32 2)
+  ret %struct.uint16x4x3_t %tmp
+}
+
+; CHECK-LABEL: test_vld3_dup_u32
+; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1]
+define %struct.uint32x2x3_t @test_vld3_dup_u32(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* %src, i32 4)
+  ret %struct.uint32x2x3_t %tmp
+}
+
+; CHECK-LABEL: test_vld3_dup_u64
+; CHECK: vld1.64 {d16, d17, d18}, [r1]
+define %struct.uint64x1x3_t @test_vld3_dup_u64(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* %src, i32 8)
+  ret %struct.uint64x1x3_t %tmp
+}
+
+; CHECK-LABEL: test_vld3_dup_u8
+; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1]
+define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
+  ret %struct.uint8x8x3_t %tmp
+}
+
+; CHECK-LABEL: test_vld4_dup_u16
+; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]
+define %struct.uint16x4x4_t @test_vld4_dup_u16(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* %src, i32 2)
+  ret %struct.uint16x4x4_t %tmp
+}
+
+; CHECK-LABEL: test_vld4_dup_u32
+; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1]
+define %struct.uint32x2x4_t @test_vld4_dup_u32(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* %src, i32 4)
+  ret %struct.uint32x2x4_t %tmp
+}
+
+; CHECK-LABEL: test_vld4_dup_u64
+; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64]
+define %struct.uint64x1x4_t @test_vld4_dup_u64(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* %src, i32 8)
+  ret %struct.uint64x1x4_t %tmp
+}
+
+; CHECK-LABEL: test_vld4_dup_u8
+; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1]
+define %struct.uint8x8x4_t @test_vld4_dup_u8(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
+  ret %struct.uint8x8x4_t %tmp
+}
+
+; CHECK-LABEL: test_vld2q_dup_u16
+; CHECK: vld2.16 {d16[], d18[]}, [r1]
+; CHECK: vld2.16 {d17[], d19[]}, [r1]
+define %struct.uint16x8x2_t @test_vld2q_dup_u16(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* %src, i32 2)
+  ret %struct.uint16x8x2_t %tmp
+}
+
+; CHECK-LABEL: test_vld2q_dup_u32
+; CHECK: vld2.32 {d16[], d18[]}, [r1]
+; CHECK: vld2.32 {d17[], d19[]}, [r1]
+define %struct.uint32x4x2_t @test_vld2q_dup_u32(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* %src, i32 4)
+  ret %struct.uint32x4x2_t %tmp
+}
+
+; CHECK-LABEL: test_vld2q_dup_u8
+; CHECK: vld2.8 {d16[], d18[]}, [r1]
+; CHECK: vld2.8 {d17[], d19[]}, [r1]
+define %struct.uint8x16x2_t @test_vld2q_dup_u8(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
+  ret %struct.uint8x16x2_t %tmp
+}
+
+; CHECK-LABEL: test_vld3q_dup_u16
+; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.16 {d17[], d19[], d21[]}, [r1]
+define %struct.uint16x8x3_t @test_vld3q_dup_u16(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* %src, i32 2)
+  ret %struct.uint16x8x3_t %tmp
+}
+
+; CHECK-LABEL: test_vld3q_dup_u32
+; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.32 {d17[], d19[], d21[]}, [r1]
+define %struct.uint32x4x3_t @test_vld3q_dup_u32(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* %src, i32 4)
+  ret %struct.uint32x4x3_t %tmp
+}
+
+; CHECK-LABEL: test_vld3q_dup_u8
+; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.8 {d17[], d19[], d21[]}, [r1]
+define %struct.uint8x16x3_t @test_vld3q_dup_u8(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
+  ret %struct.uint8x16x3_t %tmp
+}
+
+; CHECK-LABEL: test_vld4q_dup_u16
+; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]
+define %struct.uint16x8x4_t @test_vld4q_dup_u16(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* %src, i32 2)
+  ret %struct.uint16x8x4_t %tmp
+}
+
+; CHECK-LABEL: test_vld4q_dup_u32
+; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]
+define %struct.uint32x4x4_t @test_vld4q_dup_u32(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* %src, i32 4)
+  ret %struct.uint32x4x4_t %tmp
+}
+
+; CHECK-LABEL: test_vld4q_dup_u8
+; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]
+define %struct.uint8x16x4_t @test_vld4q_dup_u8(i8* %src) {
+entry:
+  %tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
+  ret %struct.uint8x16x4_t %tmp
+}
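
For context, a minimal C-level sketch of the kind of source that exercises the new Q-register path. It is illustrative only and not part of the patch: it assumes a Clang whose arm_neon.h maps the ACLE vld2q_dup_* intrinsics onto the llvm.arm.neon.vld2dup intrinsic added above, and the function name and build flags are made up for the example.

    /* Illustrative only -- assumes the front end lowers vld2q_dup_u16 to
     * @llvm.arm.neon.vld2dup.v8i16.p0i8, as in the test file above.
     * Example build: clang --target=armv8-linux-gnueabi -O2 -S vlddup_example.c */
    #include <arm_neon.h>

    /* Broadcast one interleaved uint16_t pair from *p into both halves of a
     * pair of Q registers.  With this patch the backend selects the Even/Odd
     * VLD2DUP pseudos, which expand to two spaced-list loads such as:
     *   vld2.16 {d16[], d18[]}, [r1]
     *   vld2.16 {d17[], d19[]}, [r1]
     * matching the test_vld2q_dup_u16 checks. */
    uint16x8x2_t broadcast_u16_pair(const uint16_t *p) {
      return vld2q_dup_u16(p);
    }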