Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -139,32 +139,36 @@
     VCGTU,        // Vector compare unsigned greater than.
     VTST,         // Vector test bits.
 
+    // Vector shift by vector
+    VSHLs,        // ...left/right by signed
+    VSHLu,        // ...left/right by unsigned
+
     // Vector shift by immediate:
-    VSHL,         // ...left
-    VSHRs,        // ...right (signed)
-    VSHRu,        // ...right (unsigned)
+    VSHLimm,      // ...left
+    VSHRsimm,     // ...right (signed)
+    VSHRuimm,     // ...right (unsigned)
 
     // Vector rounding shift by immediate:
-    VRSHRs,       // ...right (signed)
-    VRSHRu,       // ...right (unsigned)
-    VRSHRN,       // ...right narrow
+    VRSHRsimm,    // ...right (signed)
+    VRSHRuimm,    // ...right (unsigned)
+    VRSHRNimm,    // ...right narrow
 
     // Vector saturating shift by immediate:
-    VQSHLs,       // ...left (signed)
-    VQSHLu,       // ...left (unsigned)
-    VQSHLsu,      // ...left (signed to unsigned)
-    VQSHRNs,      // ...right narrow (signed)
-    VQSHRNu,      // ...right narrow (unsigned)
-    VQSHRNsu,     // ...right narrow (signed to unsigned)
+    VQSHLsimm,    // ...left (signed)
+    VQSHLuimm,    // ...left (unsigned)
+    VQSHLsuimm,   // ...left (signed to unsigned)
+    VQSHRNsimm,   // ...right narrow (signed)
+    VQSHRNuimm,   // ...right narrow (unsigned)
+    VQSHRNsuimm,  // ...right narrow (signed to unsigned)
 
     // Vector saturating rounding shift by immediate:
-    VQRSHRNs,     // ...right narrow (signed)
-    VQRSHRNu,     // ...right narrow (unsigned)
-    VQRSHRNsu,    // ...right narrow (signed to unsigned)
+    VQRSHRNsimm,  // ...right narrow (signed)
+    VQRSHRNuimm,  // ...right narrow (unsigned)
+    VQRSHRNsuimm, // ...right narrow (signed to unsigned)
 
     // Vector shift and insert:
-    VSLI,         // ...left
-    VSRI,         // ...right
+    VSLIimm,      // ...left
+    VSRIimm,      // ...right
 
     // Vector get lane (VMOV scalar to ARM core register)
     // (These are used for 8- and 16-bit element types only.)
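Note (illustrative, not part of the patch): the renaming above makes the two operand shapes explicit. The `*imm` nodes carry the shift amount as a single i32 constant operand applied to every lane, while the new VSHLs/VSHLu nodes take a whole vector of per-lane counts, with negative counts shifting right. A minimal fragment showing both shapes, assuming the usual ARMISelLowering context (a `SelectionDAG &DAG`, an `SDLoc dl`, and two v4i32 values `Vec` and `Amts` in scope):

```cpp
// Sketch only: the two node shapes introduced by this patch.
// Shift-by-immediate: one i32 constant, applied to every lane.
SDValue ByImm = DAG.getNode(ARMISD::VSHLimm, dl, MVT::v4i32, Vec,
                            DAG.getConstant(3, dl, MVT::i32));
// Shift-by-vector: per-lane counts; a negative count shifts right, so this one
// node covers both directions for the register-shift instructions.
SDValue ByVec = DAG.getNode(ARMISD::VSHLu, dl, MVT::v4i32, Vec, Amts);
```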
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -250,6 +250,9 @@ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); @@ -1541,23 +1544,25 @@ case ARMISD::VCGTU: return "ARMISD::VCGTU"; case ARMISD::VTST: return "ARMISD::VTST"; - case ARMISD::VSHL: return "ARMISD::VSHL"; - case ARMISD::VSHRs: return "ARMISD::VSHRs"; - case ARMISD::VSHRu: return "ARMISD::VSHRu"; - case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; - case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; - case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; - case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; - case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; - case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; - case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; - case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; - case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; - case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; - case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; - case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; - case ARMISD::VSLI: return "ARMISD::VSLI"; - case ARMISD::VSRI: return "ARMISD::VSRI"; + case ARMISD::VSHLs: return "ARMISD::VSHLs"; + case ARMISD::VSHLu: return "ARMISD::VSHLu"; + case ARMISD::VSHLimm: return "ARMISD::VSHLimm"; + case ARMISD::VSHRsimm: return "ARMISD::VSHRsimm"; + case ARMISD::VSHRuimm: return "ARMISD::VSHRuimm"; + case ARMISD::VRSHRsimm: return "ARMISD::VRSHRsimm"; + case ARMISD::VRSHRuimm: return "ARMISD::VRSHRuimm"; + case ARMISD::VRSHRNimm: return "ARMISD::VRSHRNimm"; + case ARMISD::VQSHLsimm: return "ARMISD::VQSHLsimm"; + case ARMISD::VQSHLuimm: return "ARMISD::VQSHLuimm"; + case ARMISD::VQSHLsuimm: return "ARMISD::VQSHLsuimm"; + case ARMISD::VQSHRNsimm: return "ARMISD::VQSHRNsimm"; + case ARMISD::VQSHRNuimm: return "ARMISD::VQSHRNuimm"; + case ARMISD::VQSHRNsuimm: return "ARMISD::VQSHRNsuimm"; + case ARMISD::VQRSHRNsimm: return "ARMISD::VQRSHRNsimm"; + case ARMISD::VQRSHRNuimm: return "ARMISD::VQRSHRNuimm"; + case ARMISD::VQRSHRNsuimm: return "ARMISD::VQRSHRNsuimm"; + case ARMISD::VSLIimm: return "ARMISD::VSLIimm"; + case ARMISD::VSRIimm: return "ARMISD::VSRIimm"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; @@ -5174,7 +5179,7 @@ DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? 
MVT::v2i32 : MVT::v1i64; if (VT == MVT::f64) - Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, + Mask = DAG.getNode(ARMISD::VSHLimm, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), DAG.getConstant(32, dl, MVT::i32)); else /*if (VT == MVT::f32)*/ @@ -5182,11 +5187,11 @@ if (SrcVT == MVT::f32) { Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); if (VT == MVT::f64) - Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, + Tmp1 = DAG.getNode(ARMISD::VSHLimm, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, dl, MVT::i32)); } else if (VT == MVT::f32) - Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, + Tmp1 = DAG.getNode(ARMISD::VSHRuimm, dl, MVT::v1i64, DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), DAG.getConstant(32, dl, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); @@ -5691,40 +5696,100 @@ return Res; } +/// Getvshiftimm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || + !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, + ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + int64_t ElementBits = VT.getScalarSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + int64_t ElementBits = VT.getScalarSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (!isIntrinsic) + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); + if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { + Cnt = -Cnt; + return true; + } + return false; +} + static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc dl(N); + int64_t Cnt; if (!VT.isVector()) return SDValue(); - // Lower vector shifts on NEON to use VSHL. - assert(ST->hasNEON() && "unexpected vector shift"); + // We essentially have two forms here. Shift by an immediate and shift by a + // vector register (there are also shift by a gpr, but that is just handled + // with a tablegen pattern). 
We cannot easily match shift by an immediate in + // tablegen so we do that here and generate a VSHLimm/VSHRsimm/VSHRuimm. + // For shifting by a vector, we don't have VSHR, only VSHL (which can be + // signed or unsigned, and a negative shift indicates a shift right). + if (N->getOpcode() == ISD::SHL) { + if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) + return DAG.getNode(ARMISD::VSHLimm, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), + N->getOperand(1)); + } - // Left shifts translate directly to the vshiftu intrinsic. - if (N->getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, - MVT::i32), - N->getOperand(0), N->getOperand(1)); + assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && + "unexpected vector shift opcode"); - assert((N->getOpcode() == ISD::SRA || - N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsimm : ARMISD::VSHRuimm); + return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + } - // NEON uses the same intrinsics for both left and right shifts. For - // right shifts, the shift amounts are negative, so negate the vector of - // shift amounts. + // Other right shifts we don't have operations for (we use a shift left by a + // negative number). EVT ShiftVT = N->getOperand(1).getValueType(); - SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, - getZeroVector(ShiftVT, DAG, dl), - N->getOperand(1)); - Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? - Intrinsic::arm_neon_vshifts : - Intrinsic::arm_neon_vshiftu); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(vshiftInt, dl, MVT::i32), - N->getOperand(0), NegatedCount); + SDValue NegatedCount = DAG.getNode( + ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu); + return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); } static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, @@ -12612,58 +12677,6 @@ ConvInput, DAG.getConstant(C, dl, MVT::i32)); } -/// Getvshiftimm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift operation, where all the elements of the -/// build_vector must have the same constant integer value. -static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); - BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ElementBits) || - SplatBitSize > ElementBits) - return false; - Cnt = SplatBits.getSExtValue(); - return true; -} - -/// isVShiftLImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift left operation. That value must be in the range: -/// 0 <= Value < ElementBits for a left shift; or -/// 0 <= Value <= ElementBits for a long left shift. -static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - int64_t ElementBits = VT.getScalarSizeInBits(); - if (! 
getVShiftImm(Op, ElementBits, Cnt)) - return false; - return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); -} - -/// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - int64_t ElementBits = VT.getScalarSizeInBits(); - if (! getVShiftImm(Op, ElementBits, Cnt)) - return false; - if (!isIntrinsic) - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); - if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { - Cnt = -Cnt; - return true; - } - return false; -} - /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); @@ -12699,12 +12712,12 @@ case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { - VShiftOpc = ARMISD::VSHL; + VShiftOpc = ARMISD::VSHLimm; break; } if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { - VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? - ARMISD::VSHRs : ARMISD::VSHRu); + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsimm + : ARMISD::VSHRuimm); break; } return SDValue(); @@ -12749,29 +12762,41 @@ // Opcode already set above. break; case Intrinsic::arm_neon_vrshifts: - VShiftOpc = ARMISD::VRSHRs; break; + VShiftOpc = ARMISD::VRSHRsimm; + break; case Intrinsic::arm_neon_vrshiftu: - VShiftOpc = ARMISD::VRSHRu; break; + VShiftOpc = ARMISD::VRSHRuimm; + break; case Intrinsic::arm_neon_vrshiftn: - VShiftOpc = ARMISD::VRSHRN; break; + VShiftOpc = ARMISD::VRSHRNimm; + break; case Intrinsic::arm_neon_vqshifts: - VShiftOpc = ARMISD::VQSHLs; break; + VShiftOpc = ARMISD::VQSHLsimm; + break; case Intrinsic::arm_neon_vqshiftu: - VShiftOpc = ARMISD::VQSHLu; break; + VShiftOpc = ARMISD::VQSHLuimm; + break; case Intrinsic::arm_neon_vqshiftsu: - VShiftOpc = ARMISD::VQSHLsu; break; + VShiftOpc = ARMISD::VQSHLsuimm; + break; case Intrinsic::arm_neon_vqshiftns: - VShiftOpc = ARMISD::VQSHRNs; break; + VShiftOpc = ARMISD::VQSHRNsimm; + break; case Intrinsic::arm_neon_vqshiftnu: - VShiftOpc = ARMISD::VQSHRNu; break; + VShiftOpc = ARMISD::VQSHRNuimm; + break; case Intrinsic::arm_neon_vqshiftnsu: - VShiftOpc = ARMISD::VQSHRNsu; break; + VShiftOpc = ARMISD::VQSHRNsuimm; + break; case Intrinsic::arm_neon_vqrshiftns: - VShiftOpc = ARMISD::VQRSHRNs; break; + VShiftOpc = ARMISD::VQRSHRNsimm; + break; case Intrinsic::arm_neon_vqrshiftnu: - VShiftOpc = ARMISD::VQRSHRNu; break; + VShiftOpc = ARMISD::VQRSHRNuimm; + break; case Intrinsic::arm_neon_vqrshiftnsu: - VShiftOpc = ARMISD::VQRSHRNsu; break; + VShiftOpc = ARMISD::VQRSHRNsuimm; + break; } SDLoc dl(N); @@ -12785,9 +12810,9 @@ unsigned VShiftOpc = 0; if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) - VShiftOpc = ARMISD::VSLI; + VShiftOpc = ARMISD::VSLIimm; else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) - VShiftOpc = ARMISD::VSRI; + VShiftOpc = ARMISD::VSRIimm; else { llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } @@ -12869,7 +12894,6 @@ if 
(!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); - assert(ST->hasNEON() && "unexpected vector shift"); int64_t Cnt; switch (N->getOpcode()) { @@ -12878,7 +12902,7 @@ case ISD::SHL: if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { SDLoc dl(N); - return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), + return DAG.getNode(ARMISD::VSHLimm, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); } break; @@ -12886,8 +12910,8 @@ case ISD::SRA: case ISD::SRL: if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { - unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? - ARMISD::VSHRs : ARMISD::VSHRu); + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsimm : ARMISD::VSHRuimm); SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); @@ -13653,7 +13677,7 @@ SDNode *U = *ExtVal->use_begin(); if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || - U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) + U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLimm)) return false; return true; Index: llvm/lib/Target/ARM/ARMInstrInfo.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrInfo.td +++ llvm/lib/Target/ARM/ARMInstrInfo.td @@ -254,6 +254,17 @@ def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>; + +def SDTARMVSHimm : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>,]>; +def ARMvshlImm : SDNode<"ARMISD::VSHLimm", SDTARMVSHimm>; +def ARMvshrsImm : SDNode<"ARMISD::VSHRsimm", SDTARMVSHimm>; +def ARMvshruImm : SDNode<"ARMISD::VSHRuimm", SDTARMVSHimm>; +def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>; +def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>; + def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop, [SDNPHasChain]>; Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2122,6 +2122,22 @@ defm MVE_VQRSHL_by_vec : mve_shift_by_vec_multi<"vqrshl", 0b1, 0b1>; defm MVE_VRSHL_by_vec : mve_shift_by_vec_multi<"vrshl", 0b0, 0b1>; +let Predicates = [HasMVEInt] in { +def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))), + (v4i32 (MVE_VSHL_by_vecu32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>; +def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))), + (v8i16 (MVE_VSHL_by_vecu16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>; +def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))), + (v16i8 (MVE_VSHL_by_vecu8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>; + +def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))), + (v4i32 (MVE_VSHL_by_vecs32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>; +def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))), + (v8i16 (MVE_VSHL_by_vecs16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>; +def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))), + (v16i8 (MVE_VSHL_by_vecs8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>; +} + class MVE_shift_with_imm pattern=[]> @@ -2346,6 +2362,30 @@ def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> { let Inst{21} = 0b1; } + +let Predicates = [HasMVEInt] in { +def : Pat<(v4i32 (ARMvshlImm (v4i32 MQPR:$src), imm0_31:$imm)), + (v4i32 (MVE_VSHL_immi32 (v4i32 MQPR:$src), imm0_31:$imm))>; +def : Pat<(v8i16 (ARMvshlImm (v8i16 MQPR:$src), 
imm0_15:$imm)), + (v8i16 (MVE_VSHL_immi16 (v8i16 MQPR:$src), imm0_15:$imm))>; +def : Pat<(v16i8 (ARMvshlImm (v16i8 MQPR:$src), imm0_7:$imm)), + (v16i8 (MVE_VSHL_immi8 (v16i8 MQPR:$src), imm0_7:$imm))>; + +def : Pat<(v4i32 (ARMvshruImm (v4i32 MQPR:$src), imm0_31:$imm)), + (v4i32 (MVE_VSHR_immu32 (v4i32 MQPR:$src), imm0_31:$imm))>; +def : Pat<(v8i16 (ARMvshruImm (v8i16 MQPR:$src), imm0_15:$imm)), + (v8i16 (MVE_VSHR_immu16 (v8i16 MQPR:$src), imm0_15:$imm))>; +def : Pat<(v16i8 (ARMvshruImm (v16i8 MQPR:$src), imm0_7:$imm)), + (v16i8 (MVE_VSHR_immu8 (v16i8 MQPR:$src), imm0_7:$imm))>; + +def : Pat<(v4i32 (ARMvshrsImm (v4i32 MQPR:$src), imm0_31:$imm)), + (v4i32 (MVE_VSHR_imms32 (v4i32 MQPR:$src), imm0_31:$imm))>; +def : Pat<(v8i16 (ARMvshrsImm (v8i16 MQPR:$src), imm0_15:$imm)), + (v8i16 (MVE_VSHR_imms16 (v8i16 MQPR:$src), imm0_15:$imm))>; +def : Pat<(v16i8 (ARMvshrsImm (v16i8 MQPR:$src), imm0_7:$imm)), + (v16i8 (MVE_VSHR_imms8 (v16i8 MQPR:$src), imm0_7:$imm))>; +} + // end of mve_shift instructions // start of MVE Floating Point instructions @@ -3358,6 +3398,22 @@ defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>; defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>; +let Predicates = [HasMVEInt] in { +def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), + (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>; +def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), + (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>; +def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), + (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + +def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), + (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>; +def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), + (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>; +def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), + (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>; +} + class MVE_VBRSR size, list pattern=[]> : MVE_qDest_rSrc { Index: llvm/lib/Target/ARM/ARMInstrNEON.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrNEON.td +++ llvm/lib/Target/ARM/ARMInstrNEON.td @@ -496,35 +496,30 @@ // Types for vector shift by immediates. The "SHX" version is for long and // narrow operations where the source and destination vectors have different // types. The "SHINS" version is for shift and insert operations. 
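Aside (illustrative, not part of the patch): the immediate operands used by the shift-by-immediate patterns in these files (imm0_7/imm0_15/imm0_31 in the MVE patterns above, shr_imm8/shr_imm16/shr_imm32 in the NEON patterns below) follow the range rules documented on isVShiftLImm/isVShiftRImm in the ARMISelLowering.cpp hunk. A scalar restatement of those rules as a sketch; the real checks operate on constant BUILD_VECTOR splats rather than plain integers:

```cpp
#include <cstdint>

// Left shift by immediate: 0 <= Cnt < ElementBits, or <= ElementBits for the
// long (VSHLL) form.
static bool validLeftShiftImm(int64_t Cnt, unsigned ElementBits, bool IsLong) {
  return Cnt >= 0 && (IsLong ? Cnt - 1 : Cnt) < (int64_t)ElementBits;
}

// Right shift by immediate: 1 <= Cnt <= ElementBits (ElementBits/2 when
// narrowing); the NEON intrinsics encode the count as a negative value.
static bool validRightShiftImm(int64_t Cnt, unsigned ElementBits, bool IsNarrow,
                               bool IsIntrinsic) {
  if (IsIntrinsic) {
    if (Cnt > -1)
      return false;
    Cnt = -Cnt;
  }
  int64_t Max = IsNarrow ? ElementBits / 2 : ElementBits;
  return Cnt >= 1 && Cnt <= Max;
}
```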
-def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; -def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, - SDTCisVT<2, i32>]>; -def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; +def SDTARMVSHXimm : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHINSimm : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; -def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>; -def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>; -def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>; -def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>; +def NEONvshrnImm : SDNode<"ARMISD::VSHRNimm", SDTARMVSHXimm>; -def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>; -def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>; -def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>; +def NEONvrshrsImm : SDNode<"ARMISD::VRSHRsimm", SDTARMVSHimm>; +def NEONvrshruImm : SDNode<"ARMISD::VRSHRuimm", SDTARMVSHimm>; +def NEONvrshrnImm : SDNode<"ARMISD::VRSHRNimm", SDTARMVSHXimm>; -def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>; -def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>; -def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>; -def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>; -def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>; -def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>; +def NEONvqshlsImm : SDNode<"ARMISD::VQSHLsimm", SDTARMVSHimm>; +def NEONvqshluImm : SDNode<"ARMISD::VQSHLuimm", SDTARMVSHimm>; +def NEONvqshlsuImm : SDNode<"ARMISD::VQSHLsuimm", SDTARMVSHimm>; +def NEONvqshrnsImm : SDNode<"ARMISD::VQSHRNsimm", SDTARMVSHXimm>; +def NEONvqshrnuImm : SDNode<"ARMISD::VQSHRNuimm", SDTARMVSHXimm>; +def NEONvqshrnsuImm : SDNode<"ARMISD::VQSHRNsuimm", SDTARMVSHXimm>; -def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>; -def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>; -def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>; +def NEONvqrshrnsImm : SDNode<"ARMISD::VQRSHRNsimm", SDTARMVSHXimm>; +def NEONvqrshrnuImm : SDNode<"ARMISD::VQRSHRNuimm", SDTARMVSHXimm>; +def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuimm", SDTARMVSHXimm>; -def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>; -def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>; +def NEONvsliImm : SDNode<"ARMISD::VSLIimm", SDTARMVSHINSimm>; +def NEONvsriImm : SDNode<"ARMISD::VSRIimm", SDTARMVSHINSimm>; def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; @@ -4097,72 +4092,72 @@ string OpcodeStr> { // 64-bit vector types. def v8i8 : N2VDShIns { + N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsliImm> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VDShIns { + N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsliImm> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VDShIns { + N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsliImm> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v1i64 : N2VDShIns; + N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsliImm>; // imm6 = xxxxxx // 128-bit vector types. 
def v16i8 : N2VQShIns { + N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsliImm> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v8i16 : N2VQShIns { + N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsliImm> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v4i32 : N2VQShIns { + N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsliImm> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v2i64 : N2VQShIns; + N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsliImm>; // imm6 = xxxxxx } multiclass N2VShInsR_QHSD op11_8, bit op4, string OpcodeStr> { // 64-bit vector types. def v8i8 : N2VDShIns { + N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsriImm> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VDShIns { + N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsriImm> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VDShIns { + N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsriImm> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v1i64 : N2VDShIns; + N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsriImm>; // imm6 = xxxxxx // 128-bit vector types. def v16i8 : N2VQShIns { + N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsriImm> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v8i16 : N2VQShIns { + N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsriImm> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v4i32 : N2VQShIns { + N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsriImm> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v2i64 : N2VQShIns; + N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsriImm>; // imm6 = xxxxxx } @@ -4262,11 +4257,11 @@ int_arm_neon_vraddhn, 1>; let Predicates = [HasNEON] in { -def : Pat<(v8i8 (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))), +def : Pat<(v8i8 (trunc (ARMvshruImm (add (v8i16 QPR:$Vn), QPR:$Vm), 8))), (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>; -def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))), +def : Pat<(v4i16 (trunc (ARMvshruImm (add (v4i32 QPR:$Vn), QPR:$Vm), 16))), (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>; -def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))), +def : Pat<(v2i32 (trunc (ARMvshruImm (add (v2i64 QPR:$Vn), QPR:$Vm), 32))), (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>; } @@ -5020,11 +5015,11 @@ int_arm_neon_vrsubhn, 0>; let Predicates = [HasNEON] in { -def : Pat<(v8i8 (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))), +def : Pat<(v8i8 (trunc (ARMvshruImm (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))), (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>; -def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))), +def : Pat<(v4i16 (trunc (ARMvshruImm (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))), (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>; -def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), +def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>; } @@ -5515,7 +5510,7 @@ def abd_shr : PatFrag<(ops node:$in1, node:$in2, node:$shift), - (NEONvshrs (sub (zext node:$in1), + (ARMvshrsImm (sub (zext node:$in1), (zext node:$in2)), (i32 $shift))>; let Predicates = [HasNEON] in { @@ -5782,20 +5777,57 @@ IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu>; +let Predicates = [HasNEON] in { +def : Pat<(v8i8 (ARMvshls (v8i8 DPR:$Dn), (v8i8 DPR:$Dm))), + (VSHLsv8i8 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v4i16 (ARMvshls (v4i16 DPR:$Dn), (v4i16 DPR:$Dm))), + (VSHLsv4i16 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v2i32 (ARMvshls (v2i32 DPR:$Dn), (v2i32 DPR:$Dm))), + (VSHLsv2i32 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v1i64 (ARMvshls (v1i64 DPR:$Dn), (v1i64 DPR:$Dm))), + (VSHLsv1i64 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v16i8 (ARMvshls (v16i8 QPR:$Dn), 
(v16i8 QPR:$Dm))), + (VSHLsv16i8 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v8i16 (ARMvshls (v8i16 QPR:$Dn), (v8i16 QPR:$Dm))), + (VSHLsv8i16 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v4i32 (ARMvshls (v4i32 QPR:$Dn), (v4i32 QPR:$Dm))), + (VSHLsv4i32 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v2i64 (ARMvshls (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))), + (VSHLsv2i64 QPR:$Dn, QPR:$Dm)>; + +def : Pat<(v8i8 (ARMvshlu (v8i8 DPR:$Dn), (v8i8 DPR:$Dm))), + (VSHLuv8i8 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v4i16 (ARMvshlu (v4i16 DPR:$Dn), (v4i16 DPR:$Dm))), + (VSHLuv4i16 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v2i32 (ARMvshlu (v2i32 DPR:$Dn), (v2i32 DPR:$Dm))), + (VSHLuv2i32 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v1i64 (ARMvshlu (v1i64 DPR:$Dn), (v1i64 DPR:$Dm))), + (VSHLuv1i64 DPR:$Dn, DPR:$Dm)>; +def : Pat<(v16i8 (ARMvshlu (v16i8 QPR:$Dn), (v16i8 QPR:$Dm))), + (VSHLuv16i8 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v8i16 (ARMvshlu (v8i16 QPR:$Dn), (v8i16 QPR:$Dm))), + (VSHLuv8i16 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v4i32 (ARMvshlu (v4i32 QPR:$Dn), (v4i32 QPR:$Dm))), + (VSHLuv4i32 QPR:$Dn, QPR:$Dm)>; +def : Pat<(v2i64 (ARMvshlu (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))), + (VSHLuv2i64 QPR:$Dn, QPR:$Dm)>; + +} + // VSHL : Vector Shift Left (Immediate) -defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; +defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", ARMvshlImm>; // VSHR : Vector Shift Right (Immediate) defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs", - NEONvshrs>; + ARMvshrsImm>; defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu", - NEONvshru>; + ARMvshruImm>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", - PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>; + PatFrag<(ops node:$LHS, node:$RHS), (ARMvshlImm (sext node:$LHS), node:$RHS)>>; defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", - PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>; + PatFrag<(ops node:$LHS, node:$RHS), (ARMvshlImm (zext node:$LHS), node:$RHS)>>; // VSHLL : Vector Shift Left Long (with maximum shift count) class N2VLShMax op21_16, bits<4> op11_8, bit op7, @@ -5814,37 +5846,37 @@ v2i64, v2i32, imm32>; let Predicates = [HasNEON] in { -def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))), +def : Pat<(v8i16 (ARMvshlImm (zext (v8i8 DPR:$Rn)), (i32 8))), (VSHLLi8 DPR:$Rn, 8)>; -def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))), +def : Pat<(v4i32 (ARMvshlImm (zext (v4i16 DPR:$Rn)), (i32 16))), (VSHLLi16 DPR:$Rn, 16)>; -def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))), +def : Pat<(v2i64 (ARMvshlImm (zext (v2i32 DPR:$Rn)), (i32 32))), (VSHLLi32 DPR:$Rn, 32)>; -def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))), +def : Pat<(v8i16 (ARMvshlImm (sext (v8i8 DPR:$Rn)), (i32 8))), (VSHLLi8 DPR:$Rn, 8)>; -def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))), +def : Pat<(v4i32 (ARMvshlImm (sext (v4i16 DPR:$Rn)), (i32 16))), (VSHLLi16 DPR:$Rn, 16)>; -def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))), +def : Pat<(v2i64 (ARMvshlImm (sext (v2i32 DPR:$Rn)), (i32 32))), (VSHLLi32 DPR:$Rn, 32)>; -def : Pat<(v8i16 (NEONvshl (anyext (v8i8 DPR:$Rn)), (i32 8))), +def : Pat<(v8i16 (ARMvshlImm (anyext (v8i8 DPR:$Rn)), (i32 8))), (VSHLLi8 DPR:$Rn, 8)>; -def : Pat<(v4i32 (NEONvshl (anyext (v4i16 DPR:$Rn)), (i32 16))), +def : Pat<(v4i32 (ARMvshlImm (anyext (v4i16 DPR:$Rn)), (i32 16))), (VSHLLi16 DPR:$Rn, 16)>; -def : Pat<(v2i64 (NEONvshl (anyext (v2i32 DPR:$Rn)), (i32 
32))), +def : Pat<(v2i64 (ARMvshlImm (anyext (v2i32 DPR:$Rn)), (i32 32))), (VSHLLi32 DPR:$Rn, 32)>; } // VSHRN : Vector Shift Right and Narrow defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", PatFrag<(ops node:$Rn, node:$amt), - (trunc (NEONvshrs node:$Rn, node:$amt))>>; + (trunc (ARMvshrsImm node:$Rn, node:$amt))>>; let Predicates = [HasNEON] in { -def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))), +def : Pat<(v8i8 (trunc (ARMvshruImm (v8i16 QPR:$Vn), shr_imm8:$amt))), (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>; -def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))), +def : Pat<(v4i16 (trunc (ARMvshruImm (v4i32 QPR:$Vn), shr_imm16:$amt))), (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>; -def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))), +def : Pat<(v2i32 (trunc (ARMvshruImm (v2i64 QPR:$Vn), shr_imm32:$amt))), (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>; } @@ -5857,13 +5889,13 @@ "vrshl", "u", int_arm_neon_vrshiftu>; // VRSHR : Vector Rounding Shift Right defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs", - NEONvrshrs>; + NEONvrshrsImm>; defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu", - NEONvrshru>; + NEONvrshruImm>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", - NEONvrshrn>; + NEONvrshrnImm>; // VQSHL : Vector Saturating Shift defm VQSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm, @@ -5873,21 +5905,21 @@ IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>; -defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>; +defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshlsImm>; +defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshluImm>; // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>; +defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsuImm>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", - NEONvqshrns>; + NEONvqshrnsImm>; defm VQSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u", - NEONvqshrnu>; + NEONvqshrnuImm>; // VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned) defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", - NEONvqshrnsu>; + NEONvqshrnsuImm>; // VQRSHL : Vector Saturating Rounding Shift defm VQRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm, @@ -5899,20 +5931,20 @@ // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", - NEONvqrshrns>; + NEONvqrshrnsImm>; defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u", - NEONvqrshrnu>; + NEONvqrshrnuImm>; // VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned) defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s", - NEONvqrshrnsu>; + NEONvqrshrnsuImm>; // VSRA : Vector Shift Right and Accumulate -defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>; -defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>; +defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", ARMvshrsImm>; +defm 
VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", ARMvshruImm>; // VRSRA : Vector Rounding Shift Right and Accumulate -defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; -defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; +defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrsImm>; +defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshruImm>; // VSLI : Vector Shift Left and Insert defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">; Index: llvm/test/CodeGen/ARM/vpadd.ll =================================================================== --- llvm/test/CodeGen/ARM/vpadd.ll +++ llvm/test/CodeGen/ARM/vpadd.ll @@ -285,17 +285,14 @@ define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDL_s8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.i16 d16, #0x8 -; CHECK-NEXT: vld1.64 {d18, d19}, [r0] -; CHECK-NEXT: vext.8 d17, d18, d16, #1 -; CHECK-NEXT: vneg.s16 d16, d16 -; CHECK-NEXT: vshl.i16 d18, d18, #8 -; CHECK-NEXT: vshl.i16 d17, d17, #8 -; CHECK-NEXT: vshl.s16 d18, d18, d16 -; CHECK-NEXT: vshl.s16 d16, d17, d16 -; CHECK-NEXT: vadd.i16 d16, d16, d18 -; CHECK-NEXT: vstr d16, [r1] -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vext.8 d18, d16, d16, #1 +; CHECK-NEXT: vshl.i16 d16, d16, #8 +; CHECK-NEXT: vshl.i16 d18, d18, #8 +; CHECK-NEXT: vshr.s16 d17, d18, #8 +; CHECK-NEXT: vsra.s16 d17, d16, #8 +; CHECK-NEXT: vstr d17, [r1] +; CHECK-NEXT: mov pc, lr %tmp = load <16 x i8>, <16 x i8>* %cbcr %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> @@ -488,7 +485,19 @@ ; And <2 x i8> to <2 x i32> define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) { ; CHECK-LABEL: fromExtendingExtractVectorElt_2i8: -; CHECK: vadd.i32 +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmov.u8 r1, d16[1] +; CHECK-NEXT: vmov.u8 r0, d16[0] +; CHECK-NEXT: vmov.u8 r2, d16[2] +; CHECK-NEXT: vmov.u8 r3, d16[3] +; CHECK-NEXT: vmov.32 d17[0], r1 +; CHECK-NEXT: vmov.32 d16[0], r0 +; CHECK-NEXT: vmov.32 d17[1], r3 +; CHECK-NEXT: vmov.32 d16[1], r2 +; CHECK-NEXT: vadd.i32 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> %x = add <2 x i8> %tmp2, %tmp1 @@ -497,7 +506,19 @@ define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) { ; CHECK-LABEL: fromExtendingExtractVectorElt_2i16: -; CHECK: vadd.i32 +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vmov.u16 r1, d16[1] +; CHECK-NEXT: vmov.u16 r3, d16[3] +; CHECK-NEXT: vmov.u16 r2, d16[2] +; CHECK-NEXT: vmov.32 d16[0], r0 +; CHECK-NEXT: vmov.32 d17[0], r1 +; CHECK-NEXT: vmov.32 d16[1], r2 +; CHECK-NEXT: vmov.32 d17[1], r3 +; CHECK-NEXT: vadd.i32 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> %x = add <2 x i16> %tmp2, %tmp1 Index: llvm/test/CodeGen/ARM/vuzp.ll =================================================================== --- llvm/test/CodeGen/ARM/vuzp.ll +++ llvm/test/CodeGen/ARM/vuzp.ll @@ -360,16 +360,14 @@ ; CHECK-NEXT: vld1.64 {d18, d19}, [lr] ; CHECK-NEXT: vcgt.u32 q8, q9, q8 ; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32] -; CHECK-NEXT: vmov.i8 d19, #0x7 -; CHECK-NEXT: 
vmovl.u8 q10, d18 +; CHECK-NEXT: vmovl.u8 q9, d18 ; CHECK-NEXT: vmovn.i32 d16, q8 -; CHECK-NEXT: vneg.s8 d17, d19 -; CHECK-NEXT: vmov d18, r2, r3 -; CHECK-NEXT: vuzp.8 d16, d20 +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vuzp.8 d16, d18 +; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: vshl.i8 d16, d16, #7 -; CHECK-NEXT: vshl.s8 d16, d16, d17 -; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: vbsl d16, d17, d18 +; CHECK-NEXT: vshr.s8 d16, d16, #7 +; CHECK-NEXT: vbsl d16, d18, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr @@ -392,15 +390,13 @@ ; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vld1.64 {d18, d19}, [r12] ; CHECK-NEXT: vcgt.u32 q8, q9, q8 -; CHECK-NEXT: vmov.i8 d18, #0x7 +; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: vmovn.i32 d16, q8 ; CHECK-NEXT: vuzp.8 d16, d17 -; CHECK-NEXT: vneg.s8 d17, d18 +; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vshl.i8 d16, d16, #7 -; CHECK-NEXT: vmov d18, r2, r3 -; CHECK-NEXT: vshl.s8 d16, d16, d17 -; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: vbsl d16, d17, d18 +; CHECK-NEXT: vshr.s8 d16, d16, #7 +; CHECK-NEXT: vbsl d16, d18, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { @@ -421,15 +417,13 @@ ; CHECK-NEXT: vld1.64 {d18, d19}, [r12] ; CHECK-NEXT: vcgt.u32 q8, q9, q8 ; CHECK-NEXT: vldr d18, .LCPI22_0 -; CHECK-NEXT: vmov.i8 d19, #0x7 ; CHECK-NEXT: vmovn.i32 d16, q8 ; CHECK-NEXT: vtbl.8 d16, {d16}, d18 -; CHECK-NEXT: vneg.s8 d17, d19 -; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: vshl.i8 d16, d16, #7 -; CHECK-NEXT: vshl.s8 d16, d16, d17 -; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: vbsl d16, d17, d18 +; CHECK-NEXT: vshr.s8 d16, d16, #7 +; CHECK-NEXT: vbsl d16, d18, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr ; CHECK-NEXT: .p2align 3 @@ -487,20 +481,18 @@ ; CHECK-NEXT: vcgt.u32 q8, q9, q8 ; CHECK-NEXT: vmovn.i32 d19, q10 ; CHECK-NEXT: vmov.u8 lr, d23[3] -; CHECK-NEXT: vldr d20, .LCPI23_0 ; CHECK-NEXT: vmovn.i32 d18, q8 ; CHECK-NEXT: vmovn.i16 d22, q9 -; CHECK-NEXT: vmov.i8 q9, #0x7 -; CHECK-NEXT: vneg.s8 q9, q9 +; CHECK-NEXT: vldr d18, .LCPI23_0 ; CHECK-NEXT: vmov.8 d17[0], lr -; CHECK-NEXT: vtbl.8 d16, {d22, d23}, d20 +; CHECK-NEXT: vtbl.8 d16, {d22, d23}, d18 +; CHECK-NEXT: vmov d19, r2, r3 ; CHECK-NEXT: vld1.8 {d17[1]}, [r4] ; CHECK-NEXT: add r4, sp, #8 +; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: vshl.i8 q8, q8, #7 ; CHECK-NEXT: vld1.64 {d20, d21}, [r4] -; CHECK-NEXT: vshl.s8 q8, q8, q9 -; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vshr.s8 q8, q8, #7 ; CHECK-NEXT: vbsl q8, q9, q10 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 Index: llvm/test/CodeGen/Thumb2/mve-shifts.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/mve-shifts.ll @@ -0,0 +1,319 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @shl_qq_int8_t(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: shl_qq_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.u8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = shl <16 x i8> %src1, %src2 + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shl_qq_int16_t(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: shl_qq_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.u16 q0, q0, q1 
+; CHECK-NEXT: bx lr +entry: + %0 = shl <8 x i16> %src1, %src2 + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shl_qq_int32_t(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: shl_qq_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.u32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = shl <4 x i32> %src1, %src2 + ret <4 x i32> %0 +} + + +define arm_aapcs_vfpcc <16 x i8> @shru_qq_int8_t(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: shru_qq_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vneg.s8 q1, q1 +; CHECK-NEXT: vshl.u8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr <16 x i8> %src1, %src2 + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shru_qq_int16_t(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: shru_qq_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vneg.s16 q1, q1 +; CHECK-NEXT: vshl.u16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr <8 x i16> %src1, %src2 + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shru_qq_int32_t(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: shru_qq_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vneg.s32 q1, q1 +; CHECK-NEXT: vshl.u32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr <4 x i32> %src1, %src2 + ret <4 x i32> %0 +} + + +define arm_aapcs_vfpcc <16 x i8> @shrs_qq_int8_t(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: shrs_qq_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vneg.s8 q1, q1 +; CHECK-NEXT: vshl.s8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = ashr <16 x i8> %src1, %src2 + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shrs_qq_int16_t(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: shrs_qq_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vneg.s16 q1, q1 +; CHECK-NEXT: vshl.s16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = ashr <8 x i16> %src1, %src2 + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shrs_qq_int32_t(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: shrs_qq_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vneg.s32 q1, q1 +; CHECK-NEXT: vshl.s32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = ashr <4 x i32> %src1, %src2 + ret <4 x i32> %0 +} + + +define arm_aapcs_vfpcc <16 x i8> @shl_qi_int8_t(<16 x i8> %src1) { +; CHECK-LABEL: shl_qi_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.i8 q0, q0, #4 +; CHECK-NEXT: bx lr +entry: + %0 = shl <16 x i8> %src1, + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shl_qi_int16_t(<8 x i16> %src1) { +; CHECK-LABEL: shl_qi_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.i16 q0, q0, #4 +; CHECK-NEXT: bx lr +entry: + %0 = shl <8 x i16> %src1, + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shl_qi_int32_t(<4 x i32> %src1) { +; CHECK-LABEL: shl_qi_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.i32 q0, q0, #4 +; CHECK-NEXT: bx lr +entry: + %0 = shl <4 x i32> %src1, + ret <4 x i32> %0 +} + + +define arm_aapcs_vfpcc <16 x i8> @shru_qi_int8_t(<16 x i8> %src1) { +; CHECK-LABEL: shru_qi_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshr.u8 q0, q0, #4 +; CHECK-NEXT: bx lr +entry: + %0 = lshr <16 x i8> %src1, + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shru_qi_int16_t(<8 x i16> %src1) { +; CHECK-LABEL: shru_qi_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshr.u16 q0, q0, #4 +; CHECK-NEXT: bx lr +entry: + %0 = lshr <8 x i16> %src1, + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shru_qi_int32_t(<4 x i32> %src1) { +; CHECK-LABEL: shru_qi_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshr.u32 q0, q0, 
#4 +; CHECK-NEXT: bx lr +entry: + %0 = lshr <4 x i32> %src1, + ret <4 x i32> %0 +} + + +define arm_aapcs_vfpcc <16 x i8> @shrs_qi_int8_t(<16 x i8> %src1) { +; CHECK-LABEL: shrs_qi_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshr.s8 q0, q0, #4 +; CHECK-NEXT: bx lr +entry: + %0 = ashr <16 x i8> %src1, + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shrs_qi_int16_t(<8 x i16> %src1) { +; CHECK-LABEL: shrs_qi_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshr.s16 q0, q0, #4 +; CHECK-NEXT: bx lr +entry: + %0 = ashr <8 x i16> %src1, + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shrs_qi_int32_t(<4 x i32> %src1) { +; CHECK-LABEL: shrs_qi_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshr.s32 q0, q0, #4 +; CHECK-NEXT: bx lr +entry: + %0 = ashr <4 x i32> %src1, + ret <4 x i32> %0 +} + + + +define arm_aapcs_vfpcc <16 x i8> @shl_qr_int8_t(<16 x i8> %src1, i8 %src2) { +; CHECK-LABEL: shl_qr_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.u8 q0, r0 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %s = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = shl <16 x i8> %src1, %s + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shl_qr_int16_t(<8 x i16> %src1, i16 %src2) { +; CHECK-LABEL: shl_qr_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.u16 q0, r0 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %s = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = shl <8 x i16> %src1, %s + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shl_qr_int32_t(<4 x i32> %src1, i32 %src2) { +; CHECK-LABEL: shl_qr_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vshl.u32 q0, r0 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %s = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = shl <4 x i32> %src1, %s + ret <4 x i32> %0 +} + + +define arm_aapcs_vfpcc <16 x i8> @shru_qr_int8_t(<16 x i8> %src1, i8 %src2) { +; CHECK-LABEL: shru_qr_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.8 q1, r0 +; CHECK-NEXT: vneg.s8 q1, q1 +; CHECK-NEXT: vshl.u8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %s = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = lshr <16 x i8> %src1, %s + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shru_qr_int16_t(<8 x i16> %src1, i16 %src2) { +; CHECK-LABEL: shru_qr_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.16 q1, r0 +; CHECK-NEXT: vneg.s16 q1, q1 +; CHECK-NEXT: vshl.u16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %s = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = lshr <8 x i16> %src1, %s + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shru_qr_int32_t(<4 x i32> %src1, i32 %src2) { +; CHECK-LABEL: shru_qr_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vneg.s32 q1, q1 +; CHECK-NEXT: vshl.u32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %s = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = lshr <4 x i32> %src1, %s + ret <4 x i32> %0 +} + + +define arm_aapcs_vfpcc <16 x i8> @shrs_qr_int8_t(<16 x i8> %src1, i8 %src2) { +; CHECK-LABEL: shrs_qr_int8_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.8 q1, r0 +; CHECK-NEXT: vneg.s8 q1, 
q1 +; CHECK-NEXT: vshl.s8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %s = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = ashr <16 x i8> %src1, %s + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @shrs_qr_int16_t(<8 x i16> %src1, i16 %src2) { +; CHECK-LABEL: shrs_qr_int16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.16 q1, r0 +; CHECK-NEXT: vneg.s16 q1, q1 +; CHECK-NEXT: vshl.s16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %s = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = ashr <8 x i16> %src1, %s + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @shrs_qr_int32_t(<4 x i32> %src1, i32 %src2) { +; CHECK-LABEL: shrs_qr_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vneg.s32 q1, q1 +; CHECK-NEXT: vshl.s32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %s = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = ashr <4 x i32> %src1, %s + ret <4 x i32> %0 +} + +
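For reference (hypothetical source and flags, not taken from the patch), IR like the tests above can be produced from plain vector-extension code; something along these lines, compiled with a Clang targeting an MVE-capable core, should exercise the new shift lowering:

```cpp
// shifts.cpp - illustrative only. Assumed invocation (not from the patch):
//   clang --target=arm-none-eabi -mcpu=cortex-m55 -O3 -S shifts.cpp
#include <cstdint>

typedef int32_t int32x4 __attribute__((vector_size(16)));

// Splat-immediate shift: expected to select "vshl.i32 q0, q0, #4",
// as in the shl_qi_int32_t test above.
int32x4 shl_qi(int32x4 v) { return v << 4; }

// Arithmetic shift right by a vector of counts: there is no vector
// shift-right-by-register, so the lowering negates the counts and uses the
// signed left shift (vneg.s32 + vshl.s32), as in shrs_qq_int32_t above.
int32x4 shrs_qq(int32x4 v, int32x4 n) { return v >> n; }
```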