Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.h
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h
@@ -813,7 +813,8 @@
                                         MachineBasicBlock *MBB) const;
     MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const;
-    void addMVEVectorTypes();
+    void addMVEVectorTypes(bool HasMVEFP);
+    void setAllExpand(MVT VT);
   };
 
   enum NEONModImmType {
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -221,20 +221,46 @@
   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 }
 
-void ARMTargetLowering::addMVEVectorTypes() {
+void ARMTargetLowering::setAllExpand(MVT VT) {
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, VT, Expand);
+}
+
+void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
+  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
+
+  for (auto VT : IntTypes) {
+    addRegisterClass(VT, &ARM::QPRRegClass);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+  }
+
+  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
+  for (auto VT : FloatTypes) {
+    addRegisterClass(VT, &ARM::QPRRegClass);
+    if (!HasMVEFP)
+      setAllExpand(VT);
+
+    // These are legal or custom whether we have MVE.fp or not
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+    setOperationAction(ISD::BITCAST, VT, Legal);
+    setOperationAction(ISD::LOAD, VT, Legal);
+    setOperationAction(ISD::STORE, VT, Legal);
+  }
+
   // We 'support' these types up to bitcast/load/store level, regardless of
   // MVE integer-only / float support. Only doing FP data processing on the FP
   // vector types is inhibited at integer-only level.
-
-  const MVT VecTypes[] = {
-    MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8,
-    MVT::v2f64, MVT::v4f32, MVT::v8f16,
-  };
-
-  for (auto VT : VecTypes) {
+  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
+  for (auto VT : LongTypes) {
     addRegisterClass(VT, &ARM::QPRRegClass);
-    for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
-      setOperationAction(Opc, VT, Expand);
+    setAllExpand(VT);
     setOperationAction(ISD::BITCAST, VT, Legal);
     setOperationAction(ISD::LOAD, VT, Legal);
     setOperationAction(ISD::STORE, VT, Legal);
@@ -569,7 +595,7 @@
   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
 
   if (Subtarget->hasMVEIntegerOps())
-    addMVEVectorTypes();
+    addMVEVectorTypes(Subtarget->hasMVEFloatOps());
 
   if (Subtarget->hasNEON()) {
     addDRTypeForNEON(MVT::v2f32);
@@ -6427,7 +6453,7 @@
     if (SplatUndef.isAllOnesValue())
       return DAG.getUNDEF(VT);
 
-    if (SplatBitSize <= 64) {
+    if (ST->hasNEON() && SplatBitSize <= 64) {
       // Check if an immediate VMOV works.
       EVT VmovVT;
       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
@@ -6559,10 +6585,13 @@
   }
   if (VT.getVectorElementType().isFloatingPoint()) {
     SmallVector<SDValue, 8> Ops;
+    MVT FVT = VT.getVectorElementType().getSimpleVT();
+    assert(FVT == MVT::f32 || FVT == MVT::f16);
+    MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
     for (unsigned i = 0; i < NumElts; ++i)
-      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
                                 Op.getOperand(i)));
-    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
     SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
     Val = LowerBUILD_VECTOR(Val, DAG, ST);
     if (Val.getNode())
@@ -6588,7 +6617,7 @@
       return shuffle;
   }
 
-  if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
+  if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
     // If we haven't found an efficient lowering, try splitting a 128-bit vector
     // into two 64-bit vectors; we might discover a better way to lower it.
     SmallVector<SDValue, 8> Ops(Op->op_begin(), Op->op_begin() + NumElts);
@@ -6609,7 +6638,7 @@
   // Vectors with 32- or 64-bit elements can be built by directly assigning
   // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
   // will be legalized.
-  if (EltSize >= 32) {
+  if (ST->hasNEON() && EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
     EVT EltVT = EVT::getFloatingPointVT(EltSize);
@@ -6843,6 +6872,38 @@
   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
 }
 
+enum ShuffleOpCodes {
+  OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+  OP_VREV,
+  OP_VDUP0,
+  OP_VDUP1,
+  OP_VDUP2,
+  OP_VDUP3,
+  OP_VEXT1,
+  OP_VEXT2,
+  OP_VEXT3,
+  OP_VUZPL, // VUZP, left result
+  OP_VUZPR, // VUZP, right result
+  OP_VZIPL, // VZIP, left result
+  OP_VZIPR, // VZIP, right result
+  OP_VTRNL, // VTRN, left result
+  OP_VTRNR  // VTRN, right result
+};
+
+static bool isLegalMVEShuffleOp(unsigned PFEntry) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  switch (OpNum) {
+  case OP_COPY:
+  case OP_VREV:
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3:
+    return true;
+  }
+  return false;
+}
+
 /// isShuffleMaskLegal - Targets can use this to indicate that they only
 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values @@ -6864,7 +6925,7 @@ unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); - if (Cost <= 4) + if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) return true; } @@ -6872,15 +6933,22 @@ unsigned Imm, WhichResult; unsigned EltSize = VT.getScalarSizeInBits(); - return (EltSize >= 32 || - ShuffleVectorSDNode::isSplatMask(&M[0], VT) || - isVREVMask(M, VT, 64) || - isVREVMask(M, VT, 32) || - isVREVMask(M, VT, 16) || - isVEXTMask(M, VT, ReverseVEXT, Imm) || - isVTBLMask(M, VT) || - isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || - ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); + if (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isVREVMask(M, VT, 64) || + isVREVMask(M, VT, 32) || + isVREVMask(M, VT, 16)) + return true; + else if (Subtarget->hasNEON() && + (isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTBLMask(M, VT) || + isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) + return true; + else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && + isReverseMask(M, VT)) + return true; + else + return false; } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit @@ -6892,24 +6960,6 @@ unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); - enum { - OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> - OP_VREV, - OP_VDUP0, - OP_VDUP1, - OP_VDUP2, - OP_VDUP3, - OP_VEXT1, - OP_VEXT2, - OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR // VTRN, right result - }; - if (OpNum == OP_COPY) { if (LHSID == (1*9+2)*9+3) return LHS; assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); @@ -6999,7 +7049,8 @@ DAG.getConstant(ExtractNum, DL, MVT::i32)); } -static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc dl(Op); @@ -7045,7 +7096,7 @@ bool ReverseVEXT = false; unsigned Imm = 0; - if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { if (ReverseVEXT) std::swap(V1, V2); return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, @@ -7059,7 +7110,7 @@ if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { + if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } @@ -7071,12 +7122,14 @@ // used for both shuffles. 
unsigned WhichResult = 0; bool isV_UNDEF = false; - if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( - ShuffleMask, VT, WhichResult, isV_UNDEF)) { - if (isV_UNDEF) - V2 = V1; - return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) - .getValue(WhichResult); + if (ST->hasNEON()) { + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, VT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + V2 = V1; + return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) + .getValue(WhichResult); + } } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize @@ -7094,7 +7147,7 @@ // -> // concat(VZIP(v1, v2):0, :1) // - if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { + if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { SDValue SubV1 = V1->getOperand(0); SDValue SubV2 = V1->getOperand(1); EVT SubVT = SubV1.getValueType(); @@ -7136,8 +7189,18 @@ unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + if (Cost <= 4) { + if (ST->hasNEON()) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + else if (isLegalMVEShuffleOp(PFEntry)) { + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; + unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; + if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + } } // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. @@ -7162,10 +7225,10 @@ return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) + if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); - if (VT == MVT::v8i8) + if (ST->hasNEON() && VT == MVT::v8i8) if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; @@ -8106,7 +8169,7 @@ case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); - case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); @@ -12007,10 +12070,14 @@ /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. static SDValue PerformVDUPCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); + if (!Subtarget->hasNEON()) + return SDValue(); + // Match VDUP(LOAD) -> VLD1DUP. // We match this pattern here rather than waiting for isel because the // transform is only legal for unindexed loads. 
@@ -12969,7 +13036,7 @@ case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); - case ARMISD::VDUP: return PerformVDUPCombine(N, DCI); + case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); Index: llvm/trunk/lib/Target/ARM/ARMInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrInfo.td +++ llvm/trunk/lib/Target/ARM/ARMInstrInfo.td @@ -213,6 +213,26 @@ def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; +// Vector operations shared between NEON and MVE + +def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; + +// VDUPLANE can produce a quad-register result from a double-register source, +// so the result is not constrained to match the source. +def ARMvduplane : SDNode<"ARMISD::VDUPLANE", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisVT<2, i32>]>>; + +def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; +def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; +def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; + +def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; +def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. Index: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td +++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td @@ -1486,6 +1486,30 @@ def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>; +let Predicates = [HasMVEInt] in { +def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))), + (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>; +def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))), + (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>; +def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))), + (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>; + +def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))), + (v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>; +def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))), + (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>; + +def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))), + (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>; + +def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))), + (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>; +def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))), + (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>; +def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))), + (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>; +} + def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), "vmvn", "", "$Qd, $Qm", ""> { let Inst{28} = 0b1; @@ -1684,6 +1708,55 @@ def MVE_VMOV_from_lane_u8 : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>; def MVE_VMOV_to_lane_8 : MVE_VMOV_lane_8 < "8", 0b0, MVE_VMOV_to_lane>; +let Predicates = [HasMVEInt] in { + def : Pat<(extractelt (v4i32 MQPR:$src), imm:$lane), + (COPY_TO_REGCLASS + (i32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), rGPR)>; + def : Pat<(insertelt (v4i32 MQPR:$src1), rGPR:$src2, imm:$lane), + (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$src2, imm:$lane)>; + + 
def : Pat<(vector_insert (v16i8 MQPR:$src1), rGPR:$src2, imm:$lane), + (MVE_VMOV_to_lane_8 MQPR:$src1, rGPR:$src2, imm:$lane)>; + def : Pat<(vector_insert (v8i16 MQPR:$src1), rGPR:$src2, imm:$lane), + (MVE_VMOV_to_lane_16 MQPR:$src1, rGPR:$src2, imm:$lane)>; + + def : Pat<(ARMvgetlanes (v16i8 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; + + def : Pat<(v16i8 (scalar_to_vector GPR:$src)), + (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; + def : Pat<(v8i16 (scalar_to_vector GPR:$src)), + (MVE_VMOV_to_lane_16 (v8i16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; + def : Pat<(v4i32 (scalar_to_vector GPR:$src)), + (MVE_VMOV_to_lane_32 (v4i32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; + + // Floating point patterns, still enabled under HasMVEInt + def : Pat<(extractelt (v4f32 MQPR:$src), imm:$lane), + (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), SPR)>; + def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane), + (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>; + + def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane), + (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>; + def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane), + (COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>; + + def : Pat<(v4f32 (scalar_to_vector SPR:$src)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; + def : Pat<(v4f32 (scalar_to_vector GPR:$src)), + (MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; + def : Pat<(v8f16 (scalar_to_vector HPR:$src)), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>; + def : Pat<(v8f16 (scalar_to_vector GPR:$src)), + (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; +} + // end of mve_bit instructions // start of MVE Integer instructions @@ -1898,6 +1971,35 @@ def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1>; def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP8 rGPR:$elem)>; + def : Pat<(v8i16 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP16 rGPR:$elem)>; + def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP32 rGPR:$elem)>; + + def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)), + (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; + // For the 16-bit and 8-bit vduplanes we don't care about the signedness + // of the lane move operation as we only want the lowest 8/16 bits anyway. 
+ def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)), + (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; + def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)), + (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>; + + def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))), + (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>; + def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))), + (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>; + + def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)), + (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; + def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)), + (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; +} + + class MVEIntSingleSrc size, list pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm), NoItinerary, Index: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td +++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td @@ -526,11 +526,6 @@ def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>; def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>; -def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, - SDTCisVT<2, i32>]>; -def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; -def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; - def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; @@ -547,23 +542,10 @@ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>>; -def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; - -// VDUPLANE can produce a quad-register result from a double-register source, -// so the result is not constrained to match the source. 
-def NEONvduplane : SDNode<"ARMISD::VDUPLANE", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisVT<2, i32>]>>; - def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>; -def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; -def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; -def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; -def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; - def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; @@ -1411,7 +1393,7 @@ (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListOneDAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>, + (Ty (ARMvdup (i32 (LoadOp AddrMode:$Rn)))))]>, Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; @@ -1425,7 +1407,7 @@ addrmode6dupalign32>; let Predicates = [HasNEON] in { -def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), +def : Pat<(v2f32 (ARMvdup (f32 (load addrmode6dup:$addr)))), (VLD1DUPd32 addrmode6:$addr)>; } @@ -1435,7 +1417,7 @@ (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListDPairAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> { + (Ty (ARMvdup (i32 (LoadOp AddrMode:$Rn)))))]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1449,7 +1431,7 @@ addrmode6dupalign32>; let Predicates = [HasNEON] in { -def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), +def : Pat<(v4f32 (ARMvdup (f32 (load addrmode6dup:$addr)))), (VLD1DUPq32 addrmode6:$addr)>; } @@ -2163,11 +2145,11 @@ } def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8, - NEONvgetlaneu, addrmode6> { + ARMvgetlaneu, addrmode6> { let Inst{7-5} = lane{2-0}; } def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, - NEONvgetlaneu, addrmode6> { + ARMvgetlaneu, addrmode6> { let Inst{7-6} = lane{1-0}; let Inst{4} = Rn{4}; } @@ -2178,8 +2160,8 @@ let Inst{5-4} = Rn{5-4}; } -def VST1LNq8Pseudo : VST1QLNPseudo; -def VST1LNq16Pseudo : VST1QLNPseudo; +def VST1LNq8Pseudo : VST1QLNPseudo; +def VST1LNq16Pseudo : VST1QLNPseudo; def VST1LNq32Pseudo : VST1QLNPseudo; let Predicates = [HasNEON] in { @@ -2214,11 +2196,11 @@ } def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8, - NEONvgetlaneu, addrmode6> { + ARMvgetlaneu, addrmode6> { let Inst{7-5} = lane{2-0}; } def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16, - NEONvgetlaneu, addrmode6> { + ARMvgetlaneu, addrmode6> { let Inst{7-6} = lane{1-0}; let Inst{4} = Rn{4}; } @@ -2228,8 +2210,8 @@ let Inst{5-4} = Rn{5-4}; } -def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo; -def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo; +def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo; +def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo; def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo; let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { @@ -2699,7 +2681,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$Vn), - (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { + (Ty (ARMvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { // All of these have a two-operand InstAlias. 
let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; @@ -2711,7 +2693,7 @@ NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","", [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$Vn), - (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; @@ -2747,7 +2729,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$Vn), - (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; @@ -2760,7 +2742,7 @@ NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$Vn), - (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + (ResTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; @@ -2795,7 +2777,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (Ty DPR:$Vd), (Ty (IntOp (Ty DPR:$Vn), - (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), + (Ty (ARMvduplane (Ty DPR_VFP2:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -2807,7 +2789,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (Ty DPR:$Vd), (Ty (IntOp (Ty DPR:$Vn), - (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { let isCommutable = 0; } class N3VDIntSh op21_20, bits<4> op11_8, bit op4, @@ -2862,7 +2844,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$Vn), - (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -2874,7 +2856,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$Vn), - (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + (ResTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]> { let isCommutable = 0; } @@ -2910,7 +2892,7 @@ [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$src1), (Ty (MulOp DPR:$Vn, - (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), + (Ty (ARMvduplane (Ty DPR_VFP2:$Vm), imm:$lane)))))))]>; class N3VDMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -2923,7 +2905,7 @@ [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$src1), (Ty (MulOp DPR:$Vn, - (Ty (NEONvduplane (Ty DPR_8:$Vm), + (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))))]>; class N3VQMulOp op21_20, bits<4> op11_8, bit op4, @@ -2945,7 +2927,7 @@ [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (MulOp QPR:$Vn, - (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))))]>; class N3VQMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -2959,7 +2941,7 @@ [(set (ResTy QPR:$Vd), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (MulOp QPR:$Vn, - (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + (ResTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))))]>; // Neon Intrinsic-Op instructions (VABA): double- and quad-register. 
@@ -3019,7 +3001,7 @@ [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), (TyQ (MulOp (TyD DPR:$Vn), - (TyD (NEONvduplane (TyD DPR_VFP2:$Vm), + (TyD (ARMvduplane (TyD DPR_VFP2:$Vm), imm:$lane))))))]>; class N3VLMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -3031,7 +3013,7 @@ [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), (TyQ (MulOp (TyD DPR:$Vn), - (TyD (NEONvduplane (TyD DPR_8:$Vm), + (TyD (ARMvduplane (TyD DPR_8:$Vm), imm:$lane))))))]>; // Long Intrinsic-Op vector operations with explicit extend (VABAL). @@ -3067,7 +3049,7 @@ [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$src1), (OpTy DPR:$Vn), - (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (OpTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]>; class N3VLInt3SL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -3080,7 +3062,7 @@ [(set (ResTy QPR:$Vd), (ResTy (IntOp (ResTy QPR:$src1), (OpTy DPR:$Vn), - (OpTy (NEONvduplane (OpTy DPR_8:$Vm), + (OpTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]>; // Narrowing 3-register intrinsics. @@ -3113,7 +3095,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), - (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>; + (TyD (ARMvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>; class N3VLSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> @@ -3122,7 +3104,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), - (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>; + (TyD (ARMvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>; // Long 3-register operations with explicitly extended operands. class N3VLExt op21_20, bits<4> op11_8, bit op4, @@ -3178,7 +3160,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (IntOp (OpTy DPR:$Vn), - (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + (OpTy (ARMvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]>; class N3VLIntSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, @@ -3188,7 +3170,7 @@ NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", [(set (ResTy QPR:$Vd), (ResTy (IntOp (OpTy DPR:$Vn), - (OpTy (NEONvduplane (OpTy DPR_8:$Vm), + (OpTy (ARMvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]>; // Wide 3-register operations. 
@@ -4324,43 +4306,43 @@ let Predicates = [HasNEON] in { def : Pat<(v8i16 (mul (v8i16 QPR:$src1), - (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), + (v8i16 (ARMvduplane (v8i16 QPR:$src2), imm:$lane)))), (v8i16 (VMULslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (mul (v4i32 QPR:$src1), - (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), + (v4i32 (ARMvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VMULslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), - (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))), + (v4f32 (ARMvduplane (v4f32 QPR:$src2), imm:$lane)))), (v4f32 (VMULslfq (v4f32 QPR:$src1), (v2f32 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v8f16 (fmul (v8f16 QPR:$src1), - (v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))), + (v8f16 (ARMvduplane (v8f16 QPR:$src2), imm:$lane)))), (v8f16 (VMULslhq(v8f16 QPR:$src1), (v4f16 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; -def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), +def : Pat<(v2f32 (fmul DPR:$Rn, (ARMvdup (f32 SPR:$Rm)))), (VMULslfd DPR:$Rn, (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), (i32 0))>; -def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))), +def : Pat<(v4f16 (fmul DPR:$Rn, (ARMvdup (f16 HPR:$Rm)))), (VMULslhd DPR:$Rn, (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0), (i32 0))>; -def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), +def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))), (VMULslfq QPR:$Rn, (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), (i32 0))>; -def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))), +def : Pat<(v8f16 (fmul QPR:$Rn, (ARMvdup (f16 HPR:$Rm)))), (VMULslhq QPR:$Rn, (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0), (i32 0))>; @@ -4376,14 +4358,14 @@ let Predicates = [HasNEON] in { def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1), - (v8i16 (NEONvduplane (v8i16 QPR:$src2), + (v8i16 (ARMvduplane (v8i16 QPR:$src2), imm:$lane)))), (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1), - (v4i32 (NEONvduplane (v4i32 QPR:$src2), + (v4i32 (ARMvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, @@ -4401,14 +4383,14 @@ let Predicates = [HasNEON] in { def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1), - (v8i16 (NEONvduplane (v8i16 QPR:$src2), + (v8i16 (ARMvduplane (v8i16 QPR:$src2), imm:$lane)))), (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), - (v4i32 (NEONvduplane (v4i32 QPR:$src2), + (v4i32 (ARMvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, @@ -4473,7 +4455,7 @@ let Predicates = [HasNEON] in { def : Pat<(v8i16 (add (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane))))), (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, 
(DSubReg_i16_reg imm:$lane))), @@ -4481,7 +4463,7 @@ def : Pat<(v4i32 (add (v4i32 QPR:$src1), (mul (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane))))), (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), @@ -4490,7 +4472,7 @@ def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1), (fmul_su (v4f32 QPR:$src2), - (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (ARMvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLAslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, @@ -4542,7 +4524,7 @@ (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), - (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; @@ -4550,7 +4532,7 @@ (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), - (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; @@ -4558,7 +4540,7 @@ (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane)))))), (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), @@ -4570,7 +4552,7 @@ (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane)))))), (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), @@ -4612,14 +4594,14 @@ (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), - (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; def : Pat<(v2i32 (int_arm_neon_vqsubs (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), - (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; @@ -4627,7 +4609,7 @@ (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane)))))), (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), @@ -4639,7 +4621,7 @@ (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane)))))), (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), @@ -4664,12 +4646,12 @@ (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), - (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), - (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; } @@ -4707,7 +4689,7 @@ let Predicates = [HasNEON] in { def : Pat<(v8i16 (sub (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), 
imm:$lane))))), (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), @@ -4715,7 +4697,7 @@ def : Pat<(v4i32 (sub (v4i32 QPR:$src1), (mul (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane))))), (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), @@ -4724,7 +4706,7 @@ def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1), (fmul_su (v4f32 QPR:$src2), - (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (ARMvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), @@ -4756,12 +4738,12 @@ (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), - (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), - (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; } @@ -4859,7 +4841,7 @@ (AccumType (OpNode (AccumType Ty:$Vd), (InputType Ty:$Vn), (InputType (bitconvert (AccumType - (NEONvduplane (AccumType Ty:$Vm), + (ARMvduplane (AccumType Ty:$Vm), VectorIndex32:$lane)))))), (!cast(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>; } @@ -6201,7 +6183,7 @@ def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?}, (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane), IIC_VMOVSI, "vmov", "s8", "$R, $V$lane", - [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V), + [(set GPR:$R, (ARMvgetlanes (v8i8 DPR:$V), imm:$lane))]> { let Inst{21} = lane{2}; let Inst{6-5} = lane{1-0}; @@ -6209,7 +6191,7 @@ def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1}, (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane), IIC_VMOVSI, "vmov", "s16", "$R, $V$lane", - [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V), + [(set GPR:$R, (ARMvgetlanes (v4i16 DPR:$V), imm:$lane))]> { let Inst{21} = lane{1}; let Inst{6} = lane{0}; @@ -6217,7 +6199,7 @@ def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?}, (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane), IIC_VMOVSI, "vmov", "u8", "$R, $V$lane", - [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V), + [(set GPR:$R, (ARMvgetlaneu (v8i8 DPR:$V), imm:$lane))]> { let Inst{21} = lane{2}; let Inst{6-5} = lane{1-0}; @@ -6225,7 +6207,7 @@ def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1}, (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane), IIC_VMOVSI, "vmov", "u16", "$R, $V$lane", - [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V), + [(set GPR:$R, (ARMvgetlaneu (v4i16 DPR:$V), imm:$lane))]> { let Inst{21} = lane{1}; let Inst{6} = lane{0}; @@ -6240,19 +6222,19 @@ } let Predicates = [HasNEON] in { // def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td -def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane), +def : Pat<(ARMvgetlanes (v16i8 QPR:$src), imm:$lane), (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src, (DSubReg_i8_reg imm:$lane))), (SubReg_i8_lane imm:$lane))>; -def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane), +def : Pat<(ARMvgetlanes (v8i16 QPR:$src), imm:$lane), (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane))>; -def : 
Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane), +def : Pat<(ARMvgetlaneu (v16i8 QPR:$src), imm:$lane), (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src, (DSubReg_i8_reg imm:$lane))), (SubReg_i8_lane imm:$lane))>; -def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane), +def : Pat<(ARMvgetlaneu (v8i16 QPR:$src), imm:$lane), (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane))>; @@ -6418,11 +6400,11 @@ class VDUPD opcod1, bits<2> opcod3, string Dt, ValueType Ty> : NVDup; + [(set DPR:$V, (Ty (ARMvdup (i32 GPR:$R))))]>; class VDUPQ opcod1, bits<2> opcod3, string Dt, ValueType Ty> : NVDup; + [(set QPR:$V, (Ty (ARMvdup (i32 GPR:$R))))]>; def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>; def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>; @@ -6432,16 +6414,16 @@ def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>; def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>; -// NEONvdup patterns for uarchs with fast VDUP.32. -def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>, +// ARMvdup patterns for uarchs with fast VDUP.32. +def : Pat<(v2f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>, Requires<[HasNEON,HasFastVDUP32]>; -def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>, +def : Pat<(v4f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>, Requires<[HasNEON]>; -// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead. -def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>, +// ARMvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead. +def : Pat<(v2i32 (ARMvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>, Requires<[HasNEON,HasSlowVDUP32]>; -def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>, +def : Pat<(v2f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>, Requires<[HasNEON,HasSlowVDUP32]>; // VDUP : Vector Duplicate Lane (from scalar to all elements) @@ -6450,13 +6432,13 @@ ValueType Ty, Operand IdxTy> : NVDupLane; + [(set DPR:$Vd, (Ty (ARMvduplane (Ty DPR:$Vm), imm:$lane)))]>; class VDUPLNQ op19_16, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Operand IdxTy> : NVDupLane; // Inst{19-16} is partially specified depending on the element size. 
@@ -6487,46 +6469,46 @@ } let Predicates = [HasNEON] in { -def : Pat<(v4f16 (NEONvduplane (v4f16 DPR:$Vm), imm:$lane)), +def : Pat<(v4f16 (ARMvduplane (v4f16 DPR:$Vm), imm:$lane)), (VDUPLN32d DPR:$Vm, imm:$lane)>; -def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), +def : Pat<(v2f32 (ARMvduplane (v2f32 DPR:$Vm), imm:$lane)), (VDUPLN32d DPR:$Vm, imm:$lane)>; -def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), +def : Pat<(v4f32 (ARMvduplane (v2f32 DPR:$Vm), imm:$lane)), (VDUPLN32q DPR:$Vm, imm:$lane)>; -def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), +def : Pat<(v16i8 (ARMvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, (DSubReg_i8_reg imm:$lane))), (SubReg_i8_lane imm:$lane)))>; -def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)), +def : Pat<(v8i16 (ARMvduplane (v8i16 QPR:$src), imm:$lane)), (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; -def : Pat<(v8f16 (NEONvduplane (v8f16 QPR:$src), imm:$lane)), +def : Pat<(v8f16 (ARMvduplane (v8f16 QPR:$src), imm:$lane)), (v8f16 (VDUPLN16q (v4f16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; -def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)), +def : Pat<(v4i32 (ARMvduplane (v4i32 QPR:$src), imm:$lane)), (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), +def : Pat<(v4f32 (ARMvduplane (v4f32 QPR:$src), imm:$lane)), (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f16 (NEONvdup HPR:$src)), +def : Pat<(v4f16 (ARMvdup HPR:$src)), (v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$src, ssub_0), (i32 0)))>; -def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))), +def : Pat<(v2f32 (ARMvdup (f32 SPR:$src))), (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0), (i32 0)))>; -def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))), +def : Pat<(v4f32 (ARMvdup (f32 SPR:$src))), (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0), (i32 0)))>; -def : Pat<(v8f16 (NEONvdup HPR:$src)), +def : Pat<(v8f16 (ARMvdup HPR:$src)), (v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$src, ssub_0), (i32 0)))>; } @@ -6728,18 +6710,18 @@ : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>; + [(set DPR:$Vd, (Ty (ARMvrev64 (Ty DPR:$Vm))))]>; class VREV64Q op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>; + [(set QPR:$Vd, (Ty (ARMvrev64 (Ty QPR:$Vm))))]>; def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>; def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>; def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>; let Predicates = [HasNEON] in { -def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>; +def : Pat<(v2f32 (ARMvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>; } def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>; @@ -6747,9 +6729,9 @@ def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; let Predicates = [HasNEON] in { -def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; -def : Pat<(v8f16 (NEONvrev64 (v8f16 
QPR:$Vm))), (VREV64q16 QPR:$Vm)>; -def : Pat<(v4f16 (NEONvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>; +def : Pat<(v4f32 (ARMvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; +def : Pat<(v8f16 (ARMvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>; +def : Pat<(v4f16 (ARMvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>; } // VREV32 : Vector Reverse elements within 32-bit words @@ -6758,12 +6740,12 @@ : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>; + [(set DPR:$Vd, (Ty (ARMvrev32 (Ty DPR:$Vm))))]>; class VREV32Q op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>; + [(set QPR:$Vd, (Ty (ARMvrev32 (Ty QPR:$Vm))))]>; def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>; def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>; @@ -6777,12 +6759,12 @@ : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>; + [(set DPR:$Vd, (Ty (ARMvrev16 (Ty DPR:$Vm))))]>; class VREV16Q op19_18, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm", "", - [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>; + [(set QPR:$Vd, (Ty (ARMvrev16 (Ty QPR:$Vm))))]>; def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>; def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; Index: llvm/trunk/test/CodeGen/Thumb2/mve-bitcasts.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-bitcasts.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-bitcasts.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_i64(<2 x i64> %src) { ; CHECK-LABEL: bitcast_i64_i64: Index: llvm/trunk/test/CodeGen/Thumb2/mve-shuffle.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-shuffle.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-shuffle.ll @@ -0,0 +1,615 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP + +define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) { +; CHECK-LABEL: shuffle1_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @shuffle2_i32(<4 x i32> %src) { +; CHECK-LABEL: shuffle2_i32: +; CHECK: @ %bb.0: @ 
%entry +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) { +; CHECK-LABEL: shuffle3_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @shuffle5_i32(<4 x i32> %src) { +; CHECK-LABEL: shuffle5_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev64.32 q0, q0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @shuffle6_i32(<4 x i32> %src) { +; CHECK-LABEL: shuffle6_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) { +; CHECK-LABEL: shuffle1_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @shuffle2_i16(<8 x i16> %src) { +; CHECK-LABEL: shuffle2_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { +; CHECK-LABEL: shuffle3_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @shuffle5_i16(<8 x i16> %src) { +; CHECK-LABEL: shuffle5_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev64.16 q0, q0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @shuffle6_i16(<8 x i16> %src) { +; CHECK-LABEL: shuffle6_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev32.16 q0, q0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <16 x i8> @shuffle1_i8(<16 x i8> %src) { +; CHECK-LABEL: shuffle1_i8: 
+; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.8 q0[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.8 q0[11], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.8 q0[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.8 q0[13], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @shuffle2_i8(<16 x i8> %src) { +; CHECK-LABEL: shuffle2_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) { +; CHECK-LABEL: shuffle3_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.8 q0[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.8 q0[11], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.8 q0[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.8 q0[13], r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @shuffle5_i8(<16 x i8> %src) { +; CHECK-LABEL: shuffle5_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev64.8 q0, q0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @shuffle6_i8(<16 x i8> %src) { +; CHECK-LABEL: shuffle6_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev32.8 q0, q0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @shuffle7_i8(<16 x i8> %src) { +; CHECK-LABEL: shuffle7_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev16.8 q0, q0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 
x i8> %src, <16 x i8> undef, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) { +; CHECK-LABEL: shuffle1_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> + ret <4 x float> %out +} + +define arm_aapcs_vfpcc <4 x float> @shuffle2_f32(<4 x float> %src) { +; CHECK-LABEL: shuffle2_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> + ret <4 x float> %out +} + +define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) { +; CHECK-LABEL: shuffle3_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> + ret <4 x float> %out +} + +define arm_aapcs_vfpcc <4 x float> @shuffle5_f32(<4 x float> %src) { +; CHECK-LABEL: shuffle5_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev64.32 q0, q0 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> + ret <4 x float> %out +} + +define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) { +; CHECK-MVE-LABEL: shuffle1_f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[7] +; CHECK-MVE-NEXT: vdup.16 q1, r2 +; CHECK-MVE-NEXT: vmov.u16 r1, q0[6] +; CHECK-MVE-NEXT: vmov.16 q1[0], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[5] +; CHECK-MVE-NEXT: vmov.16 q1[1], r1 +; CHECK-MVE-NEXT: vmov.16 q1[2], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[4] +; CHECK-MVE-NEXT: vmov.16 q1[3], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[3] +; CHECK-MVE-NEXT: vmov.16 q1[4], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[2] +; CHECK-MVE-NEXT: vmov.16 q1[5], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[1] +; CHECK-MVE-NEXT: vmov.16 q1[6], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[0] +; CHECK-MVE-NEXT: vmov.16 q1[7], r0 +; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: shuffle1_f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[7] +; CHECK-MVEFP-NEXT: vmov.u16 r1, q0[6] +; CHECK-MVEFP-NEXT: vmov.16 q1[0], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[5] +; CHECK-MVEFP-NEXT: vmov.16 q1[1], r1 +; CHECK-MVEFP-NEXT: vmov.16 q1[2], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[4] +; CHECK-MVEFP-NEXT: vmov.16 q1[3], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[3] +; CHECK-MVEFP-NEXT: vmov.16 q1[4], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[2] +; CHECK-MVEFP-NEXT: vmov.16 q1[5], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[1] +; CHECK-MVEFP-NEXT: vmov.16 q1[6], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[0] +; CHECK-MVEFP-NEXT: vmov.16 q1[7], r0 +; CHECK-MVEFP-NEXT: vmov q0, q1 +; CHECK-MVEFP-NEXT: bx lr +entry: + %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> + ret <8 x half> %out +} + +define arm_aapcs_vfpcc <8 x half> @shuffle2_f16(<8 x half> %src) { +; CHECK-LABEL: shuffle2_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> + ret <8 x half> %out +} + +define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { +; CHECK-MVE-LABEL: shuffle3_f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: movs r2, 
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-MVE-NEXT:    vdup.16 q1, r2
+; CHECK-MVE-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-MVE-NEXT:    vmov.16 q1[0], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-MVE-NEXT:    vmov.16 q1[1], r1
+; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
+; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle3_f16:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-MVEFP-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-MVEFP-NEXT:    vmov.16 q1[0], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-MVEFP-NEXT:    vmov.16 q1[1], r1
+; CHECK-MVEFP-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-MVEFP-NEXT:    vmov.16 q1[3], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-MVEFP-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-MVEFP-NEXT:    vmov.16 q1[5], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-MVEFP-NEXT:    vmov.16 q1[6], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-MVEFP-NEXT:    vmov.16 q1[7], r0
+; CHECK-MVEFP-NEXT:    vmov q0, q1
+; CHECK-MVEFP-NEXT:    bx lr
+entry:
+  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
+  ret <8 x half> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @shuffle5_f16(<8 x half> %src) {
+; CHECK-LABEL: shuffle5_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev64.16 q0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x half> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @shuffle6_f16(<8 x half> %src) {
+; CHECK-LABEL: shuffle6_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev32.16 q0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x half> %out
+}
+
+
+define arm_aapcs_vfpcc <4 x i32> @insert_i32(i32 %a) {
+; CHECK-LABEL: insert_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    bx lr
+entry:
+  %res = insertelement <4 x i32> undef, i32 %a, i32 0
+  ret <4 x i32> %res
+}
+
+define arm_aapcs_vfpcc <8 x i16> @insert_i16(i16 %a) {
+; CHECK-LABEL: insert_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    bx lr
+entry:
+  %res = insertelement <8 x i16> undef, i16 %a, i32 0
+  ret <8 x i16> %res
+}
+
+define arm_aapcs_vfpcc <16 x i8> @insert_i8(i8 %a) {
+; CHECK-LABEL: insert_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.8 q0[0], r0
+; CHECK-NEXT:    bx lr
+entry:
+  %res = insertelement <16 x i8> undef, i8 %a, i32 0
+  ret <16 x i8> %res
+}
+
+define arm_aapcs_vfpcc <4 x float> @insert_f32(float %a) {
+; CHECK-LABEL: insert_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    bx lr
+entry:
+  %res = insertelement <4 x float> undef, float %a, i32 0
+  ret <4 x float> %res
+}
+
+; TODO: Calling convention needs fixing to pass half types directly to functions
+define arm_aapcs_vfpcc <8 x half> @insert_f16(half *%aa) {
+; CHECK-LABEL: insert_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr.16 s0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %a = load half, half* %aa
+  %res = insertelement <8 x half> undef, half %a, i32 0
+  ret <8 x half> %res
+}
+
+define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) {
+; CHECK-LABEL: scalar_to_vector_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    movs r0, #7
+; CHECK-NEXT:    movs r1, #1
+; CHECK-NEXT:    strh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    strh.w r0, [sp]
+; CHECK-NEXT:    movt r1, #9
+; CHECK-NEXT:    ldr r0, [sp]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %f = shufflevector <8 x i16> %v, <8 x i16> , <4 x i32>
+  %0 = bitcast <4 x i16> %f to i64
+  ret i64 %0
+}
+
+
+define arm_aapcs_vfpcc i32 @extract_i32_0(<4 x i32> %a) {
+; CHECK-LABEL: extract_i32_0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <4 x i32> %a, i32 0
+  ret i32 %res
+}
+
+define arm_aapcs_vfpcc i32 @extract_i32_3(<4 x i32> %a) {
+; CHECK-LABEL: extract_i32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <4 x i32> %a, i32 3
+  ret i32 %res
+}
+
+define arm_aapcs_vfpcc i16 @extract_i16_0(<8 x i16> %a) {
+; CHECK-LABEL: extract_i16_0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <8 x i16> %a, i32 0
+  ret i16 %res
+}
+
+define arm_aapcs_vfpcc i16 @extract_i16_3(<8 x i16> %a) {
+; CHECK-LABEL: extract_i16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <8 x i16> %a, i32 3
+  ret i16 %res
+}
+
+define arm_aapcs_vfpcc i8 @extract_i8_0(<16 x i8> %a) {
+; CHECK-LABEL: extract_i8_0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <16 x i8> %a, i32 0
+  ret i8 %res
+}
+
+define arm_aapcs_vfpcc i8 @extract_i8_3(<16 x i8> %a) {
+; CHECK-LABEL: extract_i8_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <16 x i8> %a, i32 3
+  ret i8 %res
+}
+
+define arm_aapcs_vfpcc float @extract_f32_0(<4 x float> %a) {
+; CHECK-LABEL: extract_f32_0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <4 x float> %a, i32 0
+  ret float %res
+}
+
+define arm_aapcs_vfpcc float @extract_f32_3(<4 x float> %a) {
+; CHECK-LABEL: extract_f32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s0, s3
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <4 x float> %a, i32 3
+  ret float %res
+}
+
+define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) {
+; CHECK-LABEL: extract_f16_0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <8 x half> %a, i32 0
+  ret half %res
+}
+
+define arm_aapcs_vfpcc half @extract_f16_3(<8 x half> %a) {
+; CHECK-LABEL: extract_f16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %res = extractelement <8 x half> %a, i32 3
+  ret half %res
+}
Index: llvm/trunk/test/CodeGen/Thumb2/mve-vdup.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-vdup.ll
+++ llvm/trunk/test/CodeGen/Thumb2/mve-vdup.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @vdup_i32(i32 %src) {
+; CHECK-LABEL: vdup_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = insertelement <4 x i32> undef, i32 %src, i32 0
+  %out = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @vdup_i16(i16 %src) {
+; CHECK-LABEL: vdup_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = insertelement <8 x i16> undef, i16 %src, i32 0
+  %out = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @vdup_i8(i8 %src) {
+; CHECK-LABEL: vdup_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.8 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = insertelement <16 x i8> undef, i8 %src, i32 0
+  %out = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @vdup_f32_1(float %src) {
+; CHECK-LABEL: vdup_f32_1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = insertelement <4 x float> undef, float %src, i32 0
+  %out = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @vdup_f32_2(float %src1, float %src2) {
+; CHECK-LABEL: vdup_f32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = fadd float %src1, %src2
+  %1 = insertelement <4 x float> undef, float %0, i32 0
+  %out = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %out
+}
+
+; TODO: Calling convention needs fixing to pass half types directly to functions
+define arm_aapcs_vfpcc <8 x half> @vdup_f16(half* %src1, half* %src2) {
+; CHECK-LABEL: vdup_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr.16 s0, [r1]
+; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vdup.16 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load half, half *%src1, align 2
+  %1 = load half, half *%src2, align 2
+  %2 = fadd half %0, %1
+  %3 = insertelement <8 x half> undef, half %2, i32 0
+  %out = shufflevector <8 x half> %3, <8 x half> undef, <8 x i32> zeroinitializer
+  ret <8 x half> %out
+}
+
+
+
+define arm_aapcs_vfpcc <4 x i32> @vduplane_i32(<4 x i32> %src) {
+; CHECK-LABEL: vduplane_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.32 r0, q0[3]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @vduplane_i16(<8 x i16> %src) {
+; CHECK-LABEL: vduplane_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vdup.16 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @vduplane_i8(<16 x i8> %src) {
+; CHECK-LABEL: vduplane_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vdup.8 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @vduplane_f32(<4 x float> %src) {
+; CHECK-LABEL: vduplane_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.32 r0, q0[3]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @vduplane_f16(<8 x half> %src) {
+; CHECK-LABEL: vduplane_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vdup.16 q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <8 x half> %out
+}