Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -139,32 +139,36 @@
       VCGTU,        // Vector compare unsigned greater than.
       VTST,         // Vector test bits.
 
+      // Vector shift by vector
+      VSHLs,        // ...left/right by signed
+      VSHLu,        // ...left/right by unsigned
+
       // Vector shift by immediate:
-      VSHL,         // ...left
-      VSHRs,        // ...right (signed)
-      VSHRu,        // ...right (unsigned)
+      VSHLIMM,      // ...left
+      VSHRsIMM,     // ...right (signed)
+      VSHRuIMM,     // ...right (unsigned)
 
       // Vector rounding shift by immediate:
-      VRSHRs,       // ...right (signed)
-      VRSHRu,       // ...right (unsigned)
-      VRSHRN,       // ...right narrow
+      VRSHRsIMM,    // ...right (signed)
+      VRSHRuIMM,    // ...right (unsigned)
+      VRSHRNIMM,    // ...right narrow
 
       // Vector saturating shift by immediate:
-      VQSHLs,       // ...left (signed)
-      VQSHLu,       // ...left (unsigned)
-      VQSHLsu,      // ...left (signed to unsigned)
-      VQSHRNs,      // ...right narrow (signed)
-      VQSHRNu,      // ...right narrow (unsigned)
-      VQSHRNsu,     // ...right narrow (signed to unsigned)
+      VQSHLsIMM,    // ...left (signed)
+      VQSHLuIMM,    // ...left (unsigned)
+      VQSHLsuIMM,   // ...left (signed to unsigned)
+      VQSHRNsIMM,   // ...right narrow (signed)
+      VQSHRNuIMM,   // ...right narrow (unsigned)
+      VQSHRNsuIMM,  // ...right narrow (signed to unsigned)
 
       // Vector saturating rounding shift by immediate:
-      VQRSHRNs,     // ...right narrow (signed)
-      VQRSHRNu,     // ...right narrow (unsigned)
-      VQRSHRNsu,    // ...right narrow (signed to unsigned)
+      VQRSHRNsIMM,  // ...right narrow (signed)
+      VQRSHRNuIMM,  // ...right narrow (unsigned)
+      VQRSHRNsuIMM, // ...right narrow (signed to unsigned)
 
       // Vector shift and insert:
-      VSLI,         // ...left
-      VSRI,         // ...right
+      VSLIIMM,      // ...left
+      VSRIIMM,      // ...right
 
       // Vector get lane (VMOV scalar to ARM core register)
       // (These are used for 8- and 16-bit element types only.)
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1501,23 +1501,25 @@
   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
   case ARMISD::VTST:          return "ARMISD::VTST";
-  case ARMISD::VSHL:          return "ARMISD::VSHL";
-  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
-  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
-  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
-  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
-  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
-  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
-  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
-  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
-  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
-  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
-  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
-  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
-  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
-  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
-  case ARMISD::VSLI:          return "ARMISD::VSLI";
-  case ARMISD::VSRI:          return "ARMISD::VSRI";
+  case ARMISD::VSHLs:         return "ARMISD::VSHLs";
+  case ARMISD::VSHLu:         return "ARMISD::VSHLu";
+  case ARMISD::VSHLIMM:       return "ARMISD::VSHLIMM";
+  case ARMISD::VSHRsIMM:      return "ARMISD::VSHRsIMM";
+  case ARMISD::VSHRuIMM:      return "ARMISD::VSHRuIMM";
+  case ARMISD::VRSHRsIMM:     return "ARMISD::VRSHRsIMM";
+  case ARMISD::VRSHRuIMM:     return "ARMISD::VRSHRuIMM";
+  case ARMISD::VRSHRNIMM:     return "ARMISD::VRSHRNIMM";
+  case ARMISD::VQSHLsIMM:     return "ARMISD::VQSHLsIMM";
+  case ARMISD::VQSHLuIMM:     return "ARMISD::VQSHLuIMM";
+  case ARMISD::VQSHLsuIMM:    return "ARMISD::VQSHLsuIMM";
+  case ARMISD::VQSHRNsIMM:    return "ARMISD::VQSHRNsIMM";
+  case ARMISD::VQSHRNuIMM:    return "ARMISD::VQSHRNuIMM";
+  case ARMISD::VQSHRNsuIMM:   return "ARMISD::VQSHRNsuIMM";
+  case ARMISD::VQRSHRNsIMM:   return "ARMISD::VQRSHRNsIMM";
+  case ARMISD::VQRSHRNuIMM:   return "ARMISD::VQRSHRNuIMM";
+  case ARMISD::VQRSHRNsuIMM:  return "ARMISD::VQRSHRNsuIMM";
+  case ARMISD::VSLIIMM:       return "ARMISD::VSLIIMM";
+  case ARMISD::VSRIIMM:       return "ARMISD::VSRIIMM";
   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
@@ -5136,7 +5138,7 @@
                      DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
   EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
   if (VT == MVT::f64)
-    Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+    Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                        DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                        DAG.getConstant(32, dl, MVT::i32));
   else /*if (VT == MVT::f32)*/
@@ -5144,11 +5146,11 @@
   if (SrcVT == MVT::f32) {
     Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
     if (VT == MVT::f64)
-      Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+      Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                          DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                          DAG.getConstant(32, dl, MVT::i32));
   } else if (VT == MVT::f32)
-    Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
+    Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
                        DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                        DAG.getConstant(32, dl, MVT::i32));
   Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
@@ -5653,40 +5655,99 @@
   return Res;
 }
 
+/// Getvshiftimm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN ||
+      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+                            ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation. That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  int64_t ElementBits = VT.getScalarSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+///   1 <= |Value| <= ElementBits for a right shift; or
+///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+                         int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  int64_t ElementBits = VT.getScalarSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  if (!isIntrinsic)
+    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
+  if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
+    Cnt = -Cnt;
+    return true;
+  }
+  return false;
+}
+
 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                           const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
+  int64_t Cnt;
 
   if (!VT.isVector())
     return SDValue();
 
-  // Lower vector shifts on NEON to use VSHL.
-  assert(ST->hasNEON() && "unexpected vector shift");
+  // We essentially have two forms here. Shift by an immediate and shift by a
+  // vector register. We cannot easily match shift by an immediate in tablegen
+  // so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. For shifting
+  // by a vector, we don't have VSHR, only VSHL (which can be signed or
+  // unsigned, and a negative shift indicates a shift right).
+  if (N->getOpcode() == ISD::SHL) {
+    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
+      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, dl, MVT::i32));
+    return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
+                       N->getOperand(1));
+  }
 
-  // Left shifts translate directly to the vshiftu intrinsic.
-  if (N->getOpcode() == ISD::SHL)
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
-                                       MVT::i32),
-                       N->getOperand(0), N->getOperand(1));
+  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+         "unexpected vector shift opcode");
 
-  assert((N->getOpcode() == ISD::SRA ||
-          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+  if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
+    unsigned VShiftOpc =
+        (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
+    return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
+                       DAG.getConstant(Cnt, dl, MVT::i32));
+  }
 
-  // NEON uses the same intrinsics for both left and right shifts.  For
-  // right shifts, the shift amounts are negative, so negate the vector of
-  // shift amounts.
+  // Other right shifts we don't have operations for (we use a shift left by a
+  // negative number).
   EVT ShiftVT = N->getOperand(1).getValueType();
-  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
-                                     getZeroVector(ShiftVT, DAG, dl),
-                                     N->getOperand(1));
-  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
-                             Intrinsic::arm_neon_vshifts :
-                             Intrinsic::arm_neon_vshiftu);
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(vshiftInt, dl, MVT::i32),
-                     N->getOperand(0), NegatedCount);
+  SDValue NegatedCount = DAG.getNode(
+      ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
+  unsigned VShiftOpc =
+      (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
+  return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
 }
 
 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
@@ -12574,58 +12635,6 @@
                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
 }
 
-/// Getvshiftimm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift operation, where all the elements of the
-/// build_vector must have the same constant integer value.
-static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
-  // Ignore bit_converts.
-  while (Op.getOpcode() == ISD::BITCAST)
-    Op = Op.getOperand(0);
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
-                                      HasAnyUndefs, ElementBits) ||
-      SplatBitSize > ElementBits)
-    return false;
-  Cnt = SplatBits.getSExtValue();
-  return true;
-}
-
-/// isVShiftLImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift left operation. That value must be in the range:
-///   0 <= Value < ElementBits for a left shift; or
-///   0 <= Value <= ElementBits for a long left shift.
-static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
-  assert(VT.isVector() && "vector shift count is not a vector type");
-  int64_t ElementBits = VT.getScalarSizeInBits();
-  if (! getVShiftImm(Op, ElementBits, Cnt))
-    return false;
-  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
-}
-
-/// isVShiftRImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift right operation. For a shift opcode, the value
-/// is positive, but for an intrinsic the value count must be negative. The
-/// absolute value must be in the range:
-///   1 <= |Value| <= ElementBits for a right shift; or
-///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
-                         int64_t &Cnt) {
-  assert(VT.isVector() && "vector shift count is not a vector type");
-  int64_t ElementBits = VT.getScalarSizeInBits();
-  if (! getVShiftImm(Op, ElementBits, Cnt))
-    return false;
-  if (!isIntrinsic)
-    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
-  if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
-    Cnt = -Cnt;
-    return true;
-  }
-  return false;
-}
-
 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
@@ -12661,12 +12670,12 @@
   case Intrinsic::arm_neon_vshifts:
   case Intrinsic::arm_neon_vshiftu:
     if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
-      VShiftOpc = ARMISD::VSHL;
+      VShiftOpc = ARMISD::VSHLIMM;
       break;
     }
     if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
-      VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
-                   ARMISD::VSHRs : ARMISD::VSHRu);
+      VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
+                                                        : ARMISD::VSHRuIMM);
       break;
     }
     return SDValue();
@@ -12711,29 +12720,41 @@
       // Opcode already set above.
       break;
     case Intrinsic::arm_neon_vrshifts:
-      VShiftOpc = ARMISD::VRSHRs; break;
+      VShiftOpc = ARMISD::VRSHRsIMM;
+      break;
    case Intrinsic::arm_neon_vrshiftu:
-      VShiftOpc = ARMISD::VRSHRu; break;
+      VShiftOpc = ARMISD::VRSHRuIMM;
+      break;
    case Intrinsic::arm_neon_vrshiftn:
-      VShiftOpc = ARMISD::VRSHRN; break;
+      VShiftOpc = ARMISD::VRSHRNIMM;
+      break;
    case Intrinsic::arm_neon_vqshifts:
-      VShiftOpc = ARMISD::VQSHLs; break;
+      VShiftOpc = ARMISD::VQSHLsIMM;
+      break;
    case Intrinsic::arm_neon_vqshiftu:
-      VShiftOpc = ARMISD::VQSHLu; break;
+      VShiftOpc = ARMISD::VQSHLuIMM;
+      break;
    case Intrinsic::arm_neon_vqshiftsu:
-      VShiftOpc = ARMISD::VQSHLsu; break;
+      VShiftOpc = ARMISD::VQSHLsuIMM;
+      break;
    case Intrinsic::arm_neon_vqshiftns:
-      VShiftOpc = ARMISD::VQSHRNs; break;
+      VShiftOpc = ARMISD::VQSHRNsIMM;
+      break;
    case Intrinsic::arm_neon_vqshiftnu:
-      VShiftOpc = ARMISD::VQSHRNu; break;
+      VShiftOpc = ARMISD::VQSHRNuIMM;
+      break;
    case Intrinsic::arm_neon_vqshiftnsu:
-      VShiftOpc = ARMISD::VQSHRNsu; break;
+      VShiftOpc = ARMISD::VQSHRNsuIMM;
+      break;
    case Intrinsic::arm_neon_vqrshiftns:
-      VShiftOpc = ARMISD::VQRSHRNs; break;
+      VShiftOpc = ARMISD::VQRSHRNsIMM;
+      break;
    case Intrinsic::arm_neon_vqrshiftnu:
-      VShiftOpc = ARMISD::VQRSHRNu; break;
+      VShiftOpc = ARMISD::VQRSHRNuIMM;
+      break;
    case Intrinsic::arm_neon_vqrshiftnsu:
-      VShiftOpc = ARMISD::VQRSHRNsu; break;
+      VShiftOpc = ARMISD::VQRSHRNsuIMM;
+      break;
     }
 
     SDLoc dl(N);
@@ -12747,9 +12768,9 @@
     unsigned VShiftOpc = 0;
 
     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
-      VShiftOpc = ARMISD::VSLI;
+      VShiftOpc = ARMISD::VSLIIMM;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
-      VShiftOpc = ARMISD::VSRI;
+      VShiftOpc = ARMISD::VSRIIMM;
    else {
       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
     }
@@ -12840,7 +12861,7 @@
   case ISD::SHL:
     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
       SDLoc dl(N);
-      return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
+      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                          DAG.getConstant(Cnt, dl, MVT::i32));
     }
     break;
@@ -12848,8 +12869,8 @@
   case ISD::SRA:
   case ISD::SRL:
     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
-      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
-                            ARMISD::VSHRs : ARMISD::VSHRu);
+      unsigned VShiftOpc =
+          (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
       SDLoc dl(N);
       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                          DAG.getConstant(Cnt, dl, MVT::i32));
@@ -13619,7 +13640,7 @@
 
   SDNode *U = *ExtVal->use_begin();
   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
-       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
+       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
     return false;
 
   return true;
Index: llvm/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -493,38 +493,45 @@
 def NEONvcgtu    : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
 def NEONvtst     : SDNode<"ARMISD::VTST", SDTARMVCMP>;
 
+// Vector Shifts
+def SDTARMVSH    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                        SDTCisSameAs<0, 2>,]>;
+
+def NEONvshls    : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
+def NEONvshlu    : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
+
 // Types for vector shift by immediates. The "SHX" version is for long and
 // narrow operations where the source and destination vectors have different
 // types. The "SHINS" version is for shift and insert operations.
-def SDTARMVSH    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
-                                        SDTCisVT<2, i32>]>;
-def SDTARMVSHX   : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
-                                        SDTCisVT<2, i32>]>;
-def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
-                                        SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
-
-def NEONvshl     : SDNode<"ARMISD::VSHL", SDTARMVSH>;
-def NEONvshrs    : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
-def NEONvshru    : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
-def NEONvshrn    : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
-
-def NEONvrshrs   : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
-def NEONvrshru   : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
-def NEONvrshrn   : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
-
-def NEONvqshls   : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
-def NEONvqshlu   : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
-def NEONvqshlsu  : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
-def NEONvqshrns  : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
-def NEONvqshrnu  : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
-def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
-
-def NEONvqrshrns  : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
-def NEONvqrshrnu  : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
-def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
-
-def NEONvsli      : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
-def NEONvsri      : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
+def SDTARMVSHIMM    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                           SDTCisVT<2, i32>]>;
+def SDTARMVSHXIMM   : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                           SDTCisVT<2, i32>]>;
+def SDTARMVSHINSIMM : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                           SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+
+def NEONvshlImm     : SDNode<"ARMISD::VSHLIMM", SDTARMVSHIMM>;
+def NEONvshrsImm    : SDNode<"ARMISD::VSHRsIMM", SDTARMVSHIMM>;
+def NEONvshruImm    : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>;
+def NEONvshrnImm    : SDNode<"ARMISD::VSHRNIMM", SDTARMVSHXIMM>;
+
+def NEONvrshrsImm   : SDNode<"ARMISD::VRSHRsIMM", SDTARMVSHIMM>;
+def NEONvrshruImm   : SDNode<"ARMISD::VRSHRuIMM", SDTARMVSHIMM>;
+def NEONvrshrnImm   : SDNode<"ARMISD::VRSHRNIMM", SDTARMVSHXIMM>;
+
+def NEONvqshlsImm   : SDNode<"ARMISD::VQSHLsIMM", SDTARMVSHIMM>;
+def NEONvqshluImm   : SDNode<"ARMISD::VQSHLuIMM", SDTARMVSHIMM>;
+def NEONvqshlsuImm  : SDNode<"ARMISD::VQSHLsuIMM", SDTARMVSHIMM>;
+def NEONvqshrnsImm  : SDNode<"ARMISD::VQSHRNsIMM", SDTARMVSHXIMM>;
+def NEONvqshrnuImm  : SDNode<"ARMISD::VQSHRNuIMM", SDTARMVSHXIMM>;
+def NEONvqshrnsuImm : SDNode<"ARMISD::VQSHRNsuIMM", SDTARMVSHXIMM>;
+
+def NEONvqrshrnsImm  : SDNode<"ARMISD::VQRSHRNsIMM", SDTARMVSHXIMM>;
+def NEONvqrshrnuImm  : SDNode<"ARMISD::VQRSHRNuIMM", SDTARMVSHXIMM>;
+def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>;
+
+def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>;
+def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>;
 
 def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
                                          SDTCisVT<2, i32>]>;
@@ -4097,72 +4104,72 @@
                           string OpcodeStr> {
   // 64-bit vector types.
   def v8i8  : N2VDShIns
-                        N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> {
+                        N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsliImm> {
     let Inst{21-19} = 0b001; // imm6 = 001xxx
   }
   def v4i16 : N2VDShIns
-                        N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> {
+                        N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsliImm> {
     let Inst{21-20} = 0b01;  // imm6 = 01xxxx
   }
   def v2i32 : N2VDShIns
-                        N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> {
+                        N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsliImm> {
    let Inst{21} = 0b1;      // imm6 = 1xxxxx
   }
   def v1i64 : N2VDShIns
-                        N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>;
+                        N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsliImm>;
                              // imm6 = xxxxxx

   // 128-bit vector types.
   def v16i8 : N2VQShIns
-                        N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> {
+                        N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsliImm> {
    let Inst{21-19} = 0b001; // imm6 = 001xxx
   }
   def v8i16 : N2VQShIns
-                        N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> {
+                        N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsliImm> {
    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
   }
   def v4i32 : N2VQShIns
-                        N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> {
+                        N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsliImm> {
    let Inst{21} = 0b1;      // imm6 = 1xxxxx
   }
   def v2i64 : N2VQShIns
-                        N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>;
+                        N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsliImm>;
                              // imm6 = xxxxxx
 }

 multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
                           string OpcodeStr> {
   // 64-bit vector types.
   def v8i8  : N2VDShIns
-                        N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> {
+                        N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsriImm> {
    let Inst{21-19} = 0b001; // imm6 = 001xxx
   }
   def v4i16 : N2VDShIns
-                        N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> {
+                        N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsriImm> {
    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
   }
   def v2i32 : N2VDShIns
-                        N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> {
+                        N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsriImm> {
    let Inst{21} = 0b1;      // imm6 = 1xxxxx
   }
   def v1i64 : N2VDShIns
-                        N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>;
+                        N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsriImm>;
                              // imm6 = xxxxxx

   // 128-bit vector types.
   def v16i8 : N2VQShIns
-                        N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> {
+                        N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsriImm> {
    let Inst{21-19} = 0b001; // imm6 = 001xxx
   }
   def v8i16 : N2VQShIns
-                        N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> {
+                        N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsriImm> {
    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
   }
   def v4i32 : N2VQShIns
-                        N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> {
+                        N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsriImm> {
    let Inst{21} = 0b1;      // imm6 = 1xxxxx
   }
   def v2i64 : N2VQShIns
-                        N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>;
+                        N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsriImm>;
                              // imm6 = xxxxxx
 }
@@ -4262,11 +4269,11 @@
                             int_arm_neon_vraddhn, 1>;
 
 let Predicates = [HasNEON] in {
-def : Pat<(v8i8 (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+def : Pat<(v8i8 (trunc (NEONvshruImm (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
          (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+def : Pat<(v4i16 (trunc (NEONvshruImm (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
          (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+def : Pat<(v2i32 (trunc (NEONvshruImm (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
          (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>;
 }
@@ -5020,11 +5027,11 @@
                             int_arm_neon_vrsubhn, 0>;
 
 let Predicates = [HasNEON] in {
-def : Pat<(v8i8 (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+def : Pat<(v8i8 (trunc (NEONvshruImm (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
          (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+def : Pat<(v4i16 (trunc (NEONvshruImm (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
          (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+def : Pat<(v2i32 (trunc (NEONvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
          (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
 }
@@ -5515,7 +5522,7 @@
 
 def abd_shr :
    PatFrag<(ops node:$in1, node:$in2, node:$shift),
-            (NEONvshrs (sub (zext node:$in1),
+            (NEONvshrsImm (sub (zext node:$in1),
                            (zext node:$in2)), (i32 $shift))>;
 
 let Predicates = [HasNEON] in {
@@ -5782,20 +5789,57 @@
                            IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
                            "vshl", "u", int_arm_neon_vshiftu>;
+let Predicates = [HasNEON] in {
+def : Pat<(v8i8 (NEONvshls (v8i8 DPR:$Dn), (v8i8 DPR:$Dm))),
+          (VSHLsv8i8 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v4i16 (NEONvshls (v4i16 DPR:$Dn), (v4i16 DPR:$Dm))),
+          (VSHLsv4i16 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v2i32 (NEONvshls (v2i32 DPR:$Dn), (v2i32 DPR:$Dm))),
+          (VSHLsv2i32 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v1i64 (NEONvshls (v1i64 DPR:$Dn), (v1i64 DPR:$Dm))),
+          (VSHLsv1i64 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v16i8 (NEONvshls (v16i8 QPR:$Dn), (v16i8 QPR:$Dm))),
+          (VSHLsv16i8 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v8i16 (NEONvshls (v8i16 QPR:$Dn), (v8i16 QPR:$Dm))),
+          (VSHLsv8i16 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v4i32 (NEONvshls (v4i32 QPR:$Dn), (v4i32 QPR:$Dm))),
+          (VSHLsv4i32 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v2i64 (NEONvshls (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))),
+          (VSHLsv2i64 QPR:$Dn, QPR:$Dm)>;
+
+def : Pat<(v8i8 (NEONvshlu (v8i8 DPR:$Dn), (v8i8 DPR:$Dm))),
+          (VSHLuv8i8 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v4i16 (NEONvshlu (v4i16 DPR:$Dn), (v4i16 DPR:$Dm))),
+          (VSHLuv4i16 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v2i32 (NEONvshlu (v2i32 DPR:$Dn), (v2i32 DPR:$Dm))),
+          (VSHLuv2i32 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v1i64 (NEONvshlu (v1i64 DPR:$Dn), (v1i64 DPR:$Dm))),
+          (VSHLuv1i64 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v16i8 (NEONvshlu (v16i8 QPR:$Dn), (v16i8 QPR:$Dm))),
+          (VSHLuv16i8 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v8i16 (NEONvshlu (v8i16 QPR:$Dn), (v8i16 QPR:$Dm))),
+          (VSHLuv8i16 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v4i32 (NEONvshlu (v4i32 QPR:$Dn), (v4i32 QPR:$Dm))),
+          (VSHLuv4i32 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v2i64 (NEONvshlu (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))),
+          (VSHLuv2i64 QPR:$Dn, QPR:$Dm)>;
+
+}
+
 // VSHL : Vector Shift Left (Immediate)
-defm VSHLi  : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>;
+defm VSHLi  : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshlImm>;
 
 // VSHR : Vector Shift Right (Immediate)
 defm VSHRs  : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs",
-                          NEONvshrs>;
+                          NEONvshrsImm>;
 defm VSHRu  : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu",
-                          NEONvshru>;
+                          NEONvshruImm>;
 
 // VSHLL : Vector Shift Left Long
 defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s",
-  PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>;
+  PatFrag<(ops node:$LHS, node:$RHS), (NEONvshlImm (sext node:$LHS), node:$RHS)>>;
 defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u",
-  PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>;
+  PatFrag<(ops node:$LHS, node:$RHS), (NEONvshlImm (zext node:$LHS), node:$RHS)>>;
 
 // VSHLL : Vector Shift Left Long (with maximum shift count)
 class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
@@ -5814,37 +5858,37 @@
                          v2i64, v2i32, imm32>;
 
 let Predicates = [HasNEON] in {
-def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))),
+def : Pat<(v8i16 (NEONvshlImm (zext (v8i8 DPR:$Rn)), (i32 8))),
          (VSHLLi8 DPR:$Rn, 8)>;
-def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))),
+def : Pat<(v4i32 (NEONvshlImm (zext (v4i16 DPR:$Rn)), (i32 16))),
          (VSHLLi16 DPR:$Rn, 16)>;
-def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))),
+def : Pat<(v2i64 (NEONvshlImm (zext (v2i32 DPR:$Rn)), (i32 32))),
          (VSHLLi32 DPR:$Rn, 32)>;
-def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))),
+def : Pat<(v8i16 (NEONvshlImm (sext (v8i8 DPR:$Rn)), (i32 8))),
          (VSHLLi8 DPR:$Rn, 8)>;
-def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))),
+def : Pat<(v4i32 (NEONvshlImm (sext (v4i16 DPR:$Rn)), (i32 16))),
          (VSHLLi16 DPR:$Rn, 16)>;
-def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))),
+def : Pat<(v2i64 (NEONvshlImm (sext (v2i32 DPR:$Rn)), (i32 32))),
          (VSHLLi32 DPR:$Rn, 32)>;
-def : Pat<(v8i16 (NEONvshl (anyext (v8i8 DPR:$Rn)), (i32 8))),
+def : Pat<(v8i16 (NEONvshlImm (anyext (v8i8 DPR:$Rn)), (i32 8))),
          (VSHLLi8 DPR:$Rn, 8)>;
-def : Pat<(v4i32 (NEONvshl (anyext (v4i16 DPR:$Rn)), (i32 16))),
+def : Pat<(v4i32 (NEONvshlImm (anyext (v4i16 DPR:$Rn)), (i32 16))),
          (VSHLLi16 DPR:$Rn, 16)>;
-def : Pat<(v2i64 (NEONvshl (anyext (v2i32 DPR:$Rn)), (i32 32))),
+def : Pat<(v2i64 (NEONvshlImm (anyext (v2i32 DPR:$Rn)), (i32 32))),
          (VSHLLi32 DPR:$Rn, 32)>;
 }
 
 // VSHRN : Vector Shift Right and Narrow
 defm VSHRN  : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
                          PatFrag<(ops node:$Rn, node:$amt),
-                                 (trunc (NEONvshrs node:$Rn, node:$amt))>>;
+                                 (trunc (NEONvshrsImm node:$Rn, node:$amt))>>;
 
 let Predicates = [HasNEON] in {
-def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))),
+def : Pat<(v8i8 (trunc (NEONvshruImm (v8i16 QPR:$Vn), shr_imm8:$amt))),
          (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>;
-def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))),
+def : Pat<(v4i16 (trunc (NEONvshruImm (v4i32 QPR:$Vn), shr_imm16:$amt))),
          (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>;
-def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))),
+def : Pat<(v2i32 (trunc (NEONvshruImm (v2i64 QPR:$Vn), shr_imm32:$amt))),
          (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>;
 }
@@ -5857,13 +5901,13 @@
                             "vrshl", "u", int_arm_neon_vrshiftu>;
 
 // VRSHR : Vector Rounding Shift Right
 defm VRSHRs  : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs",
-                           NEONvrshrs>;
+                           NEONvrshrsImm>;
 defm VRSHRu  : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu",
-                           NEONvrshru>;
+                           NEONvrshruImm>;
 
 // VRSHRN : Vector Rounding Shift Right and Narrow
 defm VRSHRN  : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i",
-                          NEONvrshrn>;
+                          NEONvrshrnImm>;
 
 // VQSHL : Vector Saturating Shift
 defm VQSHLs  : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm,
@@ -5873,21 +5917,21 @@
                              IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
                              "vqshl", "u", int_arm_neon_vqshiftu>;
 // VQSHL : Vector Saturating Shift Left (Immediate)
-defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>;
-defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>;
+defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshlsImm>;
+defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshluImm>;
 
 // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned)
-defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>;
+defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsuImm>;
 
 // VQSHRN : Vector Saturating Shift Right and Narrow
 defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s",
-                          NEONvqshrns>;
+                          NEONvqshrnsImm>;
 defm VQSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u",
-                          NEONvqshrnu>;
+                          NEONvqshrnuImm>;
 
 // VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned)
 defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s",
-                          NEONvqshrnsu>;
+                          NEONvqshrnsuImm>;
 
 // VQRSHL : Vector Saturating Rounding Shift
 defm VQRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm,
@@ -5899,20 +5943,20 @@
 
 // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow
 defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s",
-                           NEONvqrshrns>;
+                           NEONvqrshrnsImm>;
 defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u",
-                           NEONvqrshrnu>;
+                           NEONvqrshrnuImm>;
 
 // VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
 defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s",
-                           NEONvqrshrnsu>;
+                           NEONvqrshrnsuImm>;
 
 // VSRA : Vector Shift Right and Accumulate
-defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>;
-defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>;
+defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrsImm>;
+defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshruImm>;
 
 // VRSRA : Vector Rounding Shift Right and Accumulate
-defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>;
-defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>;
+defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrsImm>;
+defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshruImm>;
 
 // VSLI : Vector Shift Left and Insert
 defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">;
Index: llvm/test/CodeGen/ARM/vpadd.ll
===================================================================
--- llvm/test/CodeGen/ARM/vpadd.ll
+++ llvm/test/CodeGen/ARM/vpadd.ll
@@ -285,17 +285,14 @@
 define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
 ; CHECK-LABEL: addCombineToVPADDL_s8:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.i16 d16, #0x8
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT:    vext.8 d17, d18, d16, #1
-; CHECK-NEXT:    vneg.s16 d16, d16
-; CHECK-NEXT:    vshl.i16 d18, d18, #8
-; CHECK-NEXT:    vshl.i16 d17, d17, #8
-; CHECK-NEXT:    vshl.s16 d18, d18, d16
-; CHECK-NEXT:    vshl.s16 d16, d17, d16
-; CHECK-NEXT:    vadd.i16 d16, d16, d18
-; CHECK-NEXT:    vstr d16, [r1]
-; CHECK-NEXT:    mov pc, lr
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vext.8 d18, d16, d16, #1
+; CHECK-NEXT:    vshl.i16 d16, d16, #8
+; CHECK-NEXT:    vshl.i16 d18, d18, #8
+; CHECK-NEXT:    vshr.s16 d17, d18, #8
+; CHECK-NEXT:    vsra.s16 d17, d16, #8
+; CHECK-NEXT:    vstr d17, [r1]
+; CHECK-NEXT:    mov pc, lr
   %tmp = load <16 x i8>, <16 x i8>* %cbcr
   %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32>
   %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32>
@@ -488,7 +485,19 @@
 ; And <2 x i8> to <2 x i32>
 define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) {
 ; CHECK-LABEL: fromExtendingExtractVectorElt_2i8:
-; CHECK: vadd.i32
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov.u8 r1, d16[1]
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    vmov.u8 r2, d16[2]
+; CHECK-NEXT:    vmov.u8 r3, d16[3]
+; CHECK-NEXT:    vmov.32 d17[0], r1
+; CHECK-NEXT:    vmov.32 d16[0], r0
+; CHECK-NEXT:    vmov.32 d17[1], r3
+; CHECK-NEXT:    vmov.32 d16[1], r2
+; CHECK-NEXT:    vadd.i32 d16, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32>
   %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32>
   %x = add <2 x i8> %tmp2, %tmp1
@@ -497,7 +506,19 @@
 
 define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) {
 ; CHECK-LABEL: fromExtendingExtractVectorElt_2i16:
-; CHECK: vadd.i32
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    vmov.u16 r1, d16[1]
+; CHECK-NEXT:    vmov.u16 r3, d16[3]
+; CHECK-NEXT:    vmov.u16 r2, d16[2]
+; CHECK-NEXT:    vmov.32 d16[0], r0
+; CHECK-NEXT:    vmov.32 d17[0], r1
+; CHECK-NEXT:    vmov.32 d16[1], r2
+; CHECK-NEXT:    vmov.32 d17[1], r3
+; CHECK-NEXT:    vadd.i32 d16, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32>
   %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32>
   %x = add <2 x i16> %tmp2, %tmp1
Index: llvm/test/CodeGen/ARM/vuzp.ll
===================================================================
--- llvm/test/CodeGen/ARM/vuzp.ll
+++ llvm/test/CodeGen/ARM/vuzp.ll
@@ -360,16 +360,14 @@
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
 ; CHECK-NEXT:    vcgt.u32 q8, q9, q8
 ; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
-; CHECK-NEXT:    vmov.i8 d19, #0x7
-; CHECK-NEXT:    vmovl.u8 q10, d18
+; CHECK-NEXT:    vmovl.u8 q9, d18
 ; CHECK-NEXT:    vmovn.i32 d16, q8
-; CHECK-NEXT:    vneg.s8 d17, d19
-; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vuzp.8 d16, d20
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vuzp.8 d16, d18
+; CHECK-NEXT:    vmov d18, r0, r1
 ; CHECK-NEXT:    vshl.i8 d16, d16, #7
-; CHECK-NEXT:    vshl.s8 d16, d16, d17
-; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vshr.s8 d16, d16, #7
+; CHECK-NEXT:    vbsl d16, d18, d17
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
@@ -392,15 +390,13 @@
 ; CHECK-NEXT:    add r12, sp, #16
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vcgt.u32 q8, q9, q8
-; CHECK-NEXT:    vmov.i8 d18, #0x7
+; CHECK-NEXT:    vmov d18, r0, r1
 ; CHECK-NEXT:    vmovn.i32 d16, q8
 ; CHECK-NEXT:    vuzp.8 d16, d17
-; CHECK-NEXT:    vneg.s8 d17, d18
+; CHECK-NEXT:    vmov d17, r2, r3
 ; CHECK-NEXT:    vshl.i8 d16, d16, #7
-; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vshl.s8 d16, d16, d17
-; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vshr.s8 d16, d16, #7
+; CHECK-NEXT:    vbsl d16, d18, d17
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
@@ -421,15 +417,13 @@
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vcgt.u32 q8, q9, q8
 ; CHECK-NEXT:    vldr d18, .LCPI22_0
-; CHECK-NEXT:    vmov.i8 d19, #0x7
 ; CHECK-NEXT:    vmovn.i32 d16, q8
 ; CHECK-NEXT:    vtbl.8 d16, {d16}, d18
-; CHECK-NEXT:    vneg.s8 d17, d19
-; CHECK-NEXT:    vmov d18, r2, r3
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d18, r0, r1
 ; CHECK-NEXT:    vshl.i8 d16, d16, #7
-; CHECK-NEXT:    vshl.s8 d16, d16, d17
-; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vshr.s8 d16, d16, #7
+; CHECK-NEXT:    vbsl d16, d18, d17
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 ; CHECK-NEXT:    .p2align 3
@@ -487,20 +481,18 @@
 ; CHECK-NEXT:    vcgt.u32 q8, q9, q8
 ; CHECK-NEXT:    vmovn.i32 d19, q10
 ; CHECK-NEXT:    vmov.u8 lr, d23[3]
-; CHECK-NEXT:    vldr d20, .LCPI23_0
 ; CHECK-NEXT:    vmovn.i32 d18, q8
 ; CHECK-NEXT:    vmovn.i16 d22, q9
-; CHECK-NEXT:    vmov.i8 q9, #0x7
-; CHECK-NEXT:    vneg.s8 q9, q9
+; CHECK-NEXT:    vldr d18, .LCPI23_0
 ; CHECK-NEXT:    vmov.8 d17[0], lr
-; CHECK-NEXT:    vtbl.8 d16, {d22, d23}, d20
+; CHECK-NEXT:    vtbl.8 d16, {d22, d23}, d18
+; CHECK-NEXT:    vmov d19, r2, r3
 ; CHECK-NEXT:    vld1.8 {d17[1]}, [r4]
 ; CHECK-NEXT:    add r4, sp, #8
+; CHECK-NEXT:    vmov d18, r0, r1
 ; CHECK-NEXT:    vshl.i8 q8, q8, #7
 ; CHECK-NEXT:    vld1.64 {d20, d21}, [r4]
-; CHECK-NEXT:    vshl.s8 q8, q8, q9
-; CHECK-NEXT:    vmov d19, r2, r3
-; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vshr.s8 q8, q8, #7
 ; CHECK-NEXT:    vbsl q8, q9, q10
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17