diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8048,7 +8048,8 @@ if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) return DAG.UnrollVectorOp(Node); - SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC); + EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, CC); return DAG.getSelect(DL, VT, Cond, Op0, Op1); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -391,6 +391,7 @@ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); } setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); @@ -428,7 +429,7 @@ } // Predicate types - const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; + const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1}; for (auto VT : pTypes) { addRegisterClass(VT, &ARM::VCCRRegClass); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -445,6 +446,16 @@ setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); } + setOperationAction(ISD::SETCC, MVT::v2i1, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand); + setOperationAction(ISD::AND, MVT::v2i1, Expand); + setOperationAction(ISD::OR, MVT::v2i1, Expand); + setOperationAction(ISD::XOR, MVT::v2i1, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); @@ -1853,8 +1864,10 @@ // MVE has a predicate register. if ((Subtarget->hasMVEIntegerOps() && - (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) || - (Subtarget->hasMVEFloatOps() && (VT == MVT::v4f32 || VT == MVT::v8f16))) + (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + VT == MVT::v16i8)) || + (Subtarget->hasMVEFloatOps() && + (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16))) return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -7616,7 +7629,10 @@ unsigned NumElts = VT.getVectorNumElements(); unsigned BoolMask; unsigned BitsPerBool; - if (NumElts == 4) { + if (NumElts == 2) { + BitsPerBool = 8; + BoolMask = 0xff; + } else if (NumElts == 4) { BitsPerBool = 4; BoolMask = 0xf; } else if (NumElts == 8) { @@ -8346,6 +8362,8 @@ static EVT getVectorTyFromPredicateVector(EVT VT) { switch (VT.getSimpleVT().SimpleTy) { + case MVT::v2i1: + return MVT::v2f64; case MVT::v4i1: return MVT::v4i32; case MVT::v8i1: @@ -8427,7 +8445,14 @@ DAG.getUNDEF(NewVT), ShuffleMask); // Now return the result of comparing the shuffled vector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 + // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 
+ if (VT == MVT::v2i1) { + SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled); + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } @@ -8927,8 +8952,15 @@ ConVec = ExtractInto(NewV1, ConVec, j); ConVec = ExtractInto(NewV2, ConVec, j); - // Now return the result of comparing the subvector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + // Now return the result of comparing the subvector with zero, which will + // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we + // convert to a v4i1 compare to fill in the two halves of the i64 as i32s. + if (VT == MVT::v2i1) { + SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec); + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); }; @@ -8993,6 +9025,22 @@ MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + if (NumElts == 2) { + EVT SubVT = MVT::v4i32; + SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); + for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, + DAG.getIntPtrConstant(i, dl)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j + 1, dl, MVT::i32)); + } + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } + EVT SubVT = MVT::getVectorVT(ElType, NumElts); SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { @@ -9839,16 +9887,17 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); EVT MemVT = LD->getMemoryVT(); - assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == Op.getValueType()); assert(LD->getExtensionType() == ISD::NON_EXTLOAD && "Expected a non-extending load"); assert(LD->isUnindexed() && "Expected a unindexed load"); - // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit + // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We - // need to make sure that 8/4 bits are actually loaded into the correct + // need to make sure that 8/4/2 bits are actually loaded into the correct // place, which means loading the value and then shuffling the values into // the bottom bits of the predicate.
// Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect @@ -9895,14 +9944,15 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); EVT MemVT = ST->getMemoryVT(); - assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == ST->getValue().getValueType()); assert(!ST->isTruncatingStore() && "Expected a non-extending store"); assert(ST->isUnindexed() && "Expected a unindexed store"); - // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits - // unset and a scalar store. + // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with + // top bits unset and a scalar store. SDLoc dl(Op); SDValue Build = ST->getValue(); if (MemVT != MVT::v16i1) { @@ -9953,7 +10003,7 @@ {ST->getChain(), Lo, Hi, ST->getBasePtr()}, MemVT, ST->getMemOperand()); } else if (Subtarget->hasMVEIntegerOps() && - ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1))) { return LowerPredicateStore(Op, DAG); } @@ -14002,8 +14052,8 @@ EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 || - VT == MVT::v8i1 || VT == MVT::v16i1) + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 || + VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1) return SDValue(); APInt SplatBits, SplatUndef; @@ -14298,8 +14348,8 @@ if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - if (Subtarget->hasMVEIntegerOps() && - (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 || + VT == MVT::v8i1 || VT == MVT::v16i1)) return PerformORCombine_i1(N, DAG, Subtarget); APInt SplatBits, SplatUndef; @@ -18564,7 +18614,8 @@ return false; // These are for predicates - if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { + if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 || + Ty == MVT::v2i1)) { if (Fast) *Fast = true; return true; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4450,6 +4450,11 @@ (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)), (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))), VCCR))>; + def v2i1 : Pat<(v2i1 (opnode (v2i1 VCCR:$p1), (v2i1 VCCR:$p2))), + (v2i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v2i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v2i1 VCCR:$p2), rGPR))), + VCCR))>; } let Predicates = [HasMVEInt] in { @@ -4469,20 +4474,20 @@ }]>; let Predicates = [HasMVEInt] in { - foreach VT = [ v4i1, v8i1, v16i1 ] in { + foreach VT = [ v2i1, v4i1, v8i1, v16i1 ] in { def : Pat<(i32 (predicate_cast (VT VCCR:$src))), (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>; def : Pat<(VT (predicate_cast (i32 VCCR:$src))), (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>; - foreach VT2 = [ v4i1, v8i1, v16i1 ] in + foreach VT2 = [ v2i1, v4i1, v8i1, v16i1 ] in def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; } // If we happen to be casting from a load we can convert that straight // into a predicate load, so long as the load is of the correct type.
- foreach VT = [ v4i1, v8i1, v16i1 ] in { + foreach VT = [ v2i1, v4i1, v8i1, v16i1 ] in { def : Pat<(VT (predicate_cast (i32 (load_align4 taddrmode_imm7<2>:$addr)))), (VT (VLDR_P0_off taddrmode_imm7<2>:$addr))>; } @@ -6778,11 +6783,15 @@ (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (vselect (v2i1 VCCR:$pred), (v2i64 MQPR:$v1), (v2i64 MQPR:$v2))), + (v2i64 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2f64 (vselect (v2i1 VCCR:$pred), (v2f64 MQPR:$v1), (v2f64 MQPR:$v2))), + (v2f64 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, @@ -6808,6 +6817,8 @@ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (zext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi64 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>; @@ -6815,6 +6826,8 @@ (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (sext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>; @@ -6822,6 +6835,8 @@ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (anyext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi64 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; } let Predicates = [HasMVEFloat] in { @@ -6862,6 +6877,8 @@ } let Predicates = [HasMVEInt] in { + def : Pat<(v2i1 (xor (v2i1 VCCR:$pred), (v2i1 (predicate_cast (i32 65535))))), + (v2i1 (MVE_VPNOT (v2i1 VCCR:$pred)))>; def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))), (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>; def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))), diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -395,7 +395,7 @@ } // MVE Condition code register. 
-def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1], 32, (add VPR)> { +def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1, v2i1], 32, (add VPR)> { // let CopyCost = -1; // Don't allow copying of status registers. } diff --git a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll --- a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll @@ -69,7 +69,7 @@ ; ; MVE-RECIP-LABEL: 'sadd' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 218 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 434 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) @@ -126,7 +126,7 @@ ; ; MVE-SIZE-LABEL: 'sadd' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) @@ -385,7 +385,7 @@ ; ; MVE-RECIP-LABEL: 'ssub' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 218 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 434 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) @@ -442,7 +442,7 @@ ; ; MVE-SIZE-LABEL: 'ssub' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) diff --git a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll --- a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll @@ -87,22 +87,22 @@ ; ; MVE-RECIP-LABEL: 'add' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 226 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 450 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 898 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 594 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 
@llvm.sadd.sat.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -162,22 +162,22 @@ ; ; MVE-SIZE-LABEL: 'add' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> 
@llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -293,22 +293,22 @@ ; ; MVE-RECIP-LABEL: 'sub' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 226 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 450 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 898 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 594 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V2I32 = call <2 x i32> 
@llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -368,22 +368,22 @@ ; ; MVE-SIZE-LABEL: 'sub' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 
x i32> undef, <2 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll --- a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll @@ -87,22 +87,22 @@ ; ; MVE-RECIP-LABEL: 'add' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: 
%V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 296 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -293,22 +293,22 @@ ; ; MVE-RECIP-LABEL: 'sub' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V2I64 = call <2 
x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 296 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> 
undef, <8 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/mve-abs.ll b/llvm/test/Analysis/CostModel/ARM/mve-abs.ll --- a/llvm/test/Analysis/CostModel/ARM/mve-abs.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-abs.ll @@ -33,22 +33,22 @@ define i32 @abs(i32 %arg) { ; MVE-RECIP-LABEL: 'abs' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 296 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> 
undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) diff --git a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll --- a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll @@ -35,22 +35,22 @@ define i32 @smin(i32 %arg) { ; MVE-RECIP-LABEL: 'smin' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) ; 
MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -142,22 +142,22 @@ define i32 @smax(i32 %arg) { ; MVE-RECIP-LABEL: 'smax' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: 
%V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -250,22 +250,22 @@ define i32 @umin(i32 %arg) { ; MVE-RECIP-LABEL: 'umin' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, 
<16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -357,22 +357,22 @@ define i32 @sub(i32 %arg) { ; MVE-RECIP-LABEL: 'sub' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll @@ -24,10 +24,10 @@ ; ; MVE-LABEL: 'reduce_i64' ; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 568 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1128 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: 
Found an estimated cost of 416 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 824 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) @@ -56,7 +56,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 532 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) @@ -130,7 +130,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll @@ -24,10 +24,10 @@ ; ; MVE-LABEL: 'reduce_i64' ; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 568 for 
instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1128 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 824 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) @@ -56,7 +56,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 532 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) @@ -130,7 +130,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll @@ -24,10 
+24,10 @@ ; ; MVE-LABEL: 'reduce_i64' ; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 568 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1128 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 824 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) @@ -56,7 +56,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 532 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) @@ -130,7 +130,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> 
undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll @@ -24,10 +24,10 @@ ; ; MVE-LABEL: 'reduce_i64' ; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 568 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1128 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 824 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) @@ -56,7 +56,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) @@ -91,7 +91,7 @@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 532 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) @@ -130,7 +130,7 
@@ ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/select.ll b/llvm/test/Analysis/CostModel/ARM/select.ll --- a/llvm/test/Analysis/CostModel/ARM/select.ll +++ b/llvm/test/Analysis/CostModel/ARM/select.ll @@ -20,28 +20,28 @@ ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 
x i32> undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-NEON-RECIP-LABEL: 'selects' diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -4,60 +4,57 @@ define <2 x i64> @v2i64(i32 %index, i32 %TC, <2 x i64> %V1, <2 x i64> %V2) { ; CHECK-LABEL: v2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov q0[2], q0[0], r0, r0 ; CHECK-NEXT: vmov.i64 q1, #0xffffffff ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r1 ; CHECK-NEXT: vmov r0, r12, d1 -; CHECK-NEXT: vmov lr, s0 ; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: vmov q0[2], q0[0], lr, r0 -; CHECK-NEXT: adc r12, r12, #0 +; CHECK-NEXT: adc lr, r12, #0 +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r0 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vmov.i32 q2, #0x1 +; CHECK-NEXT: vldr d1, [sp, #16] ; CHECK-NEXT: vmov r1, r6, d3 ; 
CHECK-NEXT: eors r0, r4 ; CHECK-NEXT: subs r1, r4, r1 ; CHECK-NEXT: sbcs.w r1, r5, r6 -; CHECK-NEXT: vmov r5, r6, d0 +; CHECK-NEXT: vmov r5, r4, d2 ; CHECK-NEXT: cset r1, lo ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov r7, r1, d2 -; CHECK-NEXT: csetm r8, ne -; CHECK-NEXT: subs r7, r5, r7 -; CHECK-NEXT: sbcs.w r1, r6, r1 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: orrs.w r0, r0, r12 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: teq.w r5, lr -; CHECK-NEXT: vmov q0[2], q0[0], r1, r8 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: veor q1, q1, q2 -; CHECK-NEXT: vldr d5, [sp, #24] -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vmov d4, r2, r3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: and r0, r0, #1 -; CHECK-NEXT: and r1, r1, #1 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: orrs.w r0, r0, lr +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vmov r1, r6, d0 ; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: subs r5, r1, r5 +; CHECK-NEXT: sbcs r6, r4 +; CHECK-NEXT: cset r6, lo +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cset r6, ne +; CHECK-NEXT: teq.w r1, r12 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: ands r1, r6 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: bfi r6, r1, #0, #8 +; CHECK-NEXT: bfi r6, r0, #8, #8 +; CHECK-NEXT: add r0, sp, #24 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC) %select = select <2 x i1> %active.lane.mask, <2 x i64> %V1, <2 x i64> %V2 ret <2 x i64> %select @@ -445,102 +442,94 @@ define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroext %m) { ; CHECK-LABEL: test_width2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: beq.w .LBB5_3 +; CHECK-NEXT: beq .LBB5_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: adds r0, r2, #1 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r2 -; CHECK-NEXT: bic r0, r0, #1 -; CHECK-NEXT: adr r2, .LCPI5_0 -; CHECK-NEXT: subs r0, #2 ; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: bic r0, r0, #1 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vldrw.u32 q2, [r2] -; CHECK-NEXT: add.w lr, r3, r0, lsr #1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: subs r0, #2 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r2 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: add.w lr, r3, r0, lsr #1 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3[2], q3[0], r8, r8 +; CHECK-NEXT: vmov q2[2], q2[0], r12, r12 ; CHECK-NEXT: vmov r6, r7, d3 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: add.w r8, r8, #2 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: vmov r9, s12 -; CHECK-NEXT: adds r2, #1 -; CHECK-NEXT: vmov q3[2], q3[0], r9, r2 -; 
CHECK-NEXT: adc r12, r3, #0 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vmov r0, r3, d2 -; CHECK-NEXT: vmov r4, r5, d7 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: add.w r12, r12, #2 +; CHECK-NEXT: vmov r0, r2, d5 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r0 +; CHECK-NEXT: adc r2, r2, #0 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vmov r4, r5, d5 ; CHECK-NEXT: subs r6, r4, r6 +; CHECK-NEXT: eor.w r0, r0, r4 ; CHECK-NEXT: sbcs r5, r7 -; CHECK-NEXT: vmov r6, r7, d6 ; CHECK-NEXT: cset r5, lo ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: subs r0, r6, r0 -; CHECK-NEXT: sbcs.w r0, r7, r3 -; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cset r5, ne +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: ands r0, r5 +; CHECK-NEXT: vmov r5, r6, d2 +; CHECK-NEXT: rsbs r2, r0, #0 +; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: @ implicit-def: $q2 +; CHECK-NEXT: subs r5, r0, r5 +; CHECK-NEXT: sbcs r4, r6 +; CHECK-NEXT: cset r4, lo +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: cset r4, ne +; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q3[2], q3[0], r0, r5 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r5 -; CHECK-NEXT: eor.w r0, r4, r2 -; CHECK-NEXT: orrs.w r0, r0, r12 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: teq.w r6, r9 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r0 -; CHECK-NEXT: veor q4, q4, q2 -; CHECK-NEXT: vand q4, q4, q3 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: and r2, r2, #1 -; CHECK-NEXT: orr.w r3, r2, r0, lsl #1 -; CHECK-NEXT: sub.w r2, r1, #8 -; CHECK-NEXT: lsls r0, r3, #31 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: ands r0, r4 +; CHECK-NEXT: sub.w r4, r1, #8 +; CHECK-NEXT: rsbs r5, r0, #0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: bfi r0, r5, #0, #1 +; CHECK-NEXT: bfi r0, r2, #1, #1 +; CHECK-NEXT: lsls r3, r0, #31 ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r0, [r2] -; CHECK-NEXT: vmovne.32 q3[0], r0 -; CHECK-NEXT: and r0, r3, #3 +; CHECK-NEXT: ldrne r3, [r4] +; CHECK-NEXT: vmovne.32 q2[0], r3 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: lsls r0, r0, #30 +; CHECK-NEXT: bfi r3, r5, #0, #8 ; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrmi r0, [r2, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r0 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: and r2, r2, #1 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #1 -; CHECK-NEXT: lsls r0, r2, #31 +; CHECK-NEXT: ldrmi r0, [r4, #4] +; CHECK-NEXT: vmovmi.32 q2[2], r0 +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: and r0, r3, #1 +; CHECK-NEXT: rsbs r2, r0, #0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: bfi r0, r2, #0, #1 +; CHECK-NEXT: ubfx r2, r3, #8, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r0, r2, #1, #1 +; CHECK-NEXT: lsls r2, r0, #31 ; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r0, s12 -; CHECK-NEXT: strne r0, [r1] -; CHECK-NEXT: and r0, r2, #3 +; CHECK-NEXT: vmovne r2, s8 +; CHECK-NEXT: strne r2, [r1] ; CHECK-NEXT: lsls r0, r0, #30 ; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r0, s14 +; CHECK-NEXT: vmovmi r0, s10 ; CHECK-NEXT: strmi r0, [r1, #4] ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: -; 
CHECK-NEXT: .LCPI5_0: -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %cmp9.not = icmp eq i8 %m, 0 br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll --- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll @@ -11,52 +11,52 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: adr r3, .LCPI0_0 -; CHECK-NEXT: mvn r2, #-2147483648 +; CHECK-NEXT: mvn r12, #-2147483648 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: subs r3, r4, r2 +; CHECK-NEXT: subs.w r3, r4, r12 ; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 ; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: vmov q1[3], q1[1], r1, r5 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mov.w r5, #-1 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: subs.w r0, r0, r12 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: adr r4, .LCPI0_1 +; CHECK-NEXT: bfi r5, r3, #0, #8 ; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: mov.w r12, #-1 ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vbic q0, q0, q2 -; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: adr r4, .LCPI0_1 +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 -; CHECK-NEXT: sbcs.w r0, r5, r1 +; CHECK-NEXT: sbcs.w r0, r12, r1 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs.w r1, r2, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r5, r3 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r2, r0, #0, #8 +; CHECK-NEXT: rsbs.w r0, r3, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r12, r5 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 @@ -89,30 +89,30 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: subs.w r2, r4, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 -; CHECK-NEXT: sbcs r2, r5, #0 +; CHECK-NEXT: subs.w r3, r4, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 +; 
CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: cset r3, lo ; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: vmov q1[3], q1[1], r1, r5 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 +; CHECK-NEXT: csetm r3, ne ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r2, r3, #0, #8 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vbic q0, q0, q2 -; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -131,46 +131,48 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: subs.w r2, r4, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 -; CHECK-NEXT: sbcs r2, r5, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r5 -; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: subs.w r3, r4, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 +; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r3, ne ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r5, r3, #0, #8 ; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vbic q0, q0, q2 -; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: sbcs.w r0, r5, r1 +; CHECK-NEXT: sbcs.w r0, r2, r1 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: sbcs.w r1, r5, r3 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: rsbs r1, r3, #0 +; CHECK-NEXT: sbcs.w r1, r2, r5 +; CHECK-NEXT: bfi r2, r0, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -201,60 +203,66 @@ define arm_aapcs_vfpcc <4 x i32> @utest_f32i32(<4 x float> %x) { ; CHECK-LABEL: utest_f32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; 
CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r6, d9 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: vmov r5, r0, d8 -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: subs.w r2, r5, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: vmov.i64 q5, #0xffffffff +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmov r0, r4, d8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: subs.w r2, r7, #-1 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r7 -; CHECK-NEXT: sbcs r2, r4, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r10, r6 +; CHECK-NEXT: subs.w r2, r5, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 ; CHECK-NEXT: cset r2, lo ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r7, r2, #0, #8 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r6, #-1 -; CHECK-NEXT: sbcs r1, r9, #0 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: subs.w r3, r10, #-1 -; CHECK-NEXT: sbcs r3, r8, #0 -; CHECK-NEXT: cset r3, lo -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vorn q1, q1, q2 -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorn q0, q0, q2 +; CHECK-NEXT: bfi r7, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r7 +; CHECK-NEXT: vpsel q0, q0, q5 ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: vmov.f32 s2, s24 +; CHECK-NEXT: vmov.f32 s3, s26 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %conv = fptoui <4 x float> %x to <4 x i64> %0 = icmp ult <4 x i64> %conv, @@ -266,96 +274,101 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; 
CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r4, r0, d9 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vmov r4, r2, d8 -; CHECK-NEXT: subs.w r3, r5, #-1 -; CHECK-NEXT: sbcs r3, r6, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 -; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: subs.w r2, r5, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: vmov.i64 q5, #0xffffffff -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r6 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 +; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vmov.i32 q6, #0x0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r3 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q1, q5, q1 -; CHECK-NEXT: vorr q4, q0, q1 -; CHECK-NEXT: vmov r9, r8, d9 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: subs.w r0, r0, #-1 -; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r7, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: sbcs.w r0, r7, r3 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmov r0, r4, d8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q7, q0, q6 +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: csetm r7, ne +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: subs.w r3, r0, #-1 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: subs.w r2, r5, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r6 +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: vmov r1, r3, d8 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs.w r6, r9, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r7 -; CHECK-NEXT: sbcs.w r6, r2, r8 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r7 -; CHECK-NEXT: cset r6, lt -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q1, q5, q1 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: vmov r0, r7, d1 -; CHECK-NEXT: csetm r6, ne -; CHECK-NEXT: vmov r5, r4, d0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: sbcs.w r1, r2, r3 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r6 -; CHECK-NEXT: vand q2, q4, q2 +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q5 +; 
CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: sbcs.w r0, r2, r7 +; CHECK-NEXT: sbcs.w r0, r7, r1 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs r3, r5, #0 -; CHECK-NEXT: sbcs r2, r4 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: sbcs.w r1, r7, r3 +; CHECK-NEXT: bfi r7, r0, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r7, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r7 +; CHECK-NEXT: vpsel q0, q0, q6 ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s10 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vmov.f32 s2, s28 +; CHECK-NEXT: vmov.f32 s3, s30 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %conv = fptosi <4 x float> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -433,60 +446,63 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-LABEL: ustest_f16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: mov r8, r1 -; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: rsbs r2, r4, #0 +; CHECK-NEXT: mov.w r6, #0 +; CHECK-NEXT: sbcs.w r2, r6, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r6, r1 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r3, r5, #0 -; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: sbcs.w r3, r2, r7 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: vmov q1[2], q1[0], r4, r5 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: rsbs r7, r4, #0 -; CHECK-NEXT: sbcs.w r7, r2, r8 -; CHECK-NEXT: cset r7, lt -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: sbcs.w r6, r2, r9 -; CHECK-NEXT: vmov q3[2], q3[0], r7, r3 -; CHECK-NEXT: cset r6, lt -; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csetm r6, ne +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: bl __fixhfdi +; CHECK-NEXT: rsbs r2, r4, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: 
sbcs.w r2, r6, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: sbcs.w r0, r2, r1 +; CHECK-NEXT: sbcs.w r0, r6, r1 +; CHECK-NEXT: bfi r6, r2, #0, #8 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r6 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: bfi r6, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: vpsel q0, q0, q5 ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: vmov.f32 s2, s24 +; CHECK-NEXT: vmov.f32 s3, s26 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %conv = fptosi <4 x half> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -516,47 +532,47 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: movw r4, #32767 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 -; CHECK-NEXT: adr r5, .LCPI9_0 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adr.w r12, .LCPI9_0 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: movw lr, #32768 +; CHECK-NEXT: movt lr, #65535 ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: adr r5, .LCPI9_1 -; CHECK-NEXT: subs r0, r0, r4 -; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r1, r2, r4 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: movw r4, #32768 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: subs r1, r1, r4 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: movt r4, #65535 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r1, r4, r2 -; CHECK-NEXT: sbcs.w r1, r12, r3 +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: subs r1, r3, r4 +; CHECK-NEXT: sbcs r1, r5, #0 +; CHECK-NEXT: adr r4, .LCPI9_1 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: subs.w r1, lr, r1 +; CHECK-NEXT: sbcs.w r1, r12, r2 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: subs.w r1, lr, r3 +; CHECK-NEXT: sbcs.w r1, r12, r5 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 @@ -596,26 +612,26 @@ ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl 
__aeabi_d2ulz ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: movw r4, #65535 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 -; CHECK-NEXT: movw r5, #65535 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: subs r0, r0, r5 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: subs r0, r0, r4 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r1, r2, r5 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r5, r0, #0, #8 +; CHECK-NEXT: subs r0, r2, r4 +; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -641,42 +657,44 @@ ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: movw r4, #65535 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 -; CHECK-NEXT: movw r5, #65535 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: subs r0, r0, r5 -; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r1, r2, r5 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: subs r1, r1, r4 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: sbcs.w r0, r5, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: sbcs.w r1, r5, r3 +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: subs r1, r3, r4 +; CHECK-NEXT: sbcs r1, r5, #0 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: sbcs.w r1, r0, r2 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: rsbs r2, r3, #0 +; CHECK-NEXT: sbcs.w r2, r0, r5 +; CHECK-NEXT: bfi r0, r1, #0, #8 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -1357,68 +1375,71 @@ define arm_aapcs_vfpcc <2 x i32> @stest_f64i32_mm(<2 x double> %x) 
{ ; CHECK-LABEL: stest_f64i32_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov.w r4, #-2147483648 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: mvn r5, #-2147483648 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: cset r2, mi -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r5, ne -; CHECK-NEXT: cmp r0, r5 -; CHECK-NEXT: csel r0, r0, r5, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, eq +; CHECK-NEXT: adr r3, .LCPI27_0 +; CHECK-NEXT: mvn r12, #-2147483648 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: subs.w r3, r4, r12 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 +; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r6, r1, r8, mi -; CHECK-NEXT: cmp.w r6, #-1 -; CHECK-NEXT: cset r1, gt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r2, r0, r4, ne -; CHECK-NEXT: cmp.w r0, #-2147483648 -; CHECK-NEXT: csel r3, r0, r4, hi -; CHECK-NEXT: adds r0, r6, #1 -; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: cset r7, eq -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r7, r3, r2, ne -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, mi -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r5, ne -; CHECK-NEXT: cmp r0, r5 -; CHECK-NEXT: csel r0, r0, r5, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r1, r8, mi -; CHECK-NEXT: cmp.w r1, #-1 -; CHECK-NEXT: cset r2, gt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r4, ne -; CHECK-NEXT: cmp.w r0, #-2147483648 -; CHECK-NEXT: csel r0, r0, r4, hi -; CHECK-NEXT: adds r3, r1, #1 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: orr.w r1, r1, r1, asr #31 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: orr.w r2, r6, r6, asr #31 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r7 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r2 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: subs.w r0, r0, r12 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r5, r3, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: mov.w r12, #-1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: adr r4, .LCPI27_1 +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r12, r1 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #0, #8 +; CHECK-NEXT: rsbs.w r0, r3, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r12, r5 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: 
.LCPI27_0: +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .LCPI27_1: +; CHECK-NEXT: .long 2147483648 @ 0x80000000 +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 2147483648 @ 0x80000000 +; CHECK-NEXT: .long 4294967295 @ 0xffffffff entry: %conv = fptosi <2 x double> %x to <2 x i64> %spec.store.select = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %conv, <2 x i64> ) @@ -1435,32 +1456,32 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r2, r3, d8 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: mov.w r4, #-1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vldr s17, .LCPI28_0 -; CHECK-NEXT: csel r5, r0, r4, ne -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov s18, r5 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: vmov.f32 s19, s17 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r0, r0, r4, ne -; CHECK-NEXT: vmov s16, r0 -; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: subs.w r3, r4, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: cset r3, lo +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r2, r3, #0, #8 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI28_0: -; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %conv = fptoui <2 x double> %x to <2 x i64> %spec.store.select = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %conv, <2 x i64> ) @@ -1471,66 +1492,55 @@ define arm_aapcs_vfpcc <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: vmov r2, r12, d8 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, mi -; CHECK-NEXT: mov.w r5, #-1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vldr s17, .LCPI29_0 -; CHECK-NEXT: csel r3, r0, r5, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: csel r0, r0, r3, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r1, r4, mi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, gt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r3, r0, r3, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r6, r0, r3, ne -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r12 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov s18, r6 -; CHECK-NEXT: cset r2, 
mi -; CHECK-NEXT: vmov.f32 s19, s17 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r5, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, eq +; CHECK-NEXT: subs.w r3, r4, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 +; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r1, r4, mi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, gt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: vmov s16, r0 -; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r5, r3, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r2, r1 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: rsbs r1, r3, #0 +; CHECK-NEXT: sbcs.w r1, r2, r5 +; CHECK-NEXT: bfi r2, r0, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI29_0: -; CHECK-NEXT: .long 0x00000000 @ float 0 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %conv = fptosi <2 x double> %x to <2 x i64> %spec.store.select = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %conv, <2 x i64> ) @@ -1555,41 +1565,66 @@ define arm_aapcs_vfpcc <4 x i32> @utest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: utest_f32i32_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vmov r2, r5, d8 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: mov.w r6, #-1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r8, r0, r6, ne -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r7, r0, r6, ne +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r4, r0, r6, ne -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: subs.w r2, r5, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: vmov.i64 q5, #0xffffffff +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs 
r0, r1, #0 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmov r0, r4, d8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r7, r8 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r0, r0, r6, ne -; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: subs.w r2, r5, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r7, r2, #0, #8 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r7, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r7 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s2, s24 +; CHECK-NEXT: vmov.f32 s3, s26 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %conv = fptoui <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -1600,100 +1635,101 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i32_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov.w r9, #-1 ; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: cset r2, mi +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: subs.w r2, r5, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: vmov.i64 q5, #0xffffffff ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r9, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r0, r1, r7, mi +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r1, gt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r2, r1, ne +; CHECK-NEXT: vmov.i32 q6, #0x0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r7, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: 
vmov r0, r5, d8 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r8, r2, r1, ne +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: sbcs.w r0, r7, r3 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmov r0, r4, d8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q7, q0, q6 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, mi -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r9, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r1, r7, mi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, gt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r6, r0, r2, ne +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, mi -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r9, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r1, r7, mi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, gt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r4, r0, r2, ne -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r8 -; CHECK-NEXT: cset r2, mi -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r9, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r1, r7, mi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, gt +; CHECK-NEXT: subs.w r2, r5, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r7, r1 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: sbcs.w r1, r7, r3 +; CHECK-NEXT: bfi r7, r0, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r7, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r7 +; CHECK-NEXT: vpsel q0, q0, q6 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s2, s28 +; CHECK-NEXT: vmov.f32 s3, s30 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %conv 
= fptosi <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -1768,52 +1804,60 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, gt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r4, r0, r2, ne -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, gt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r5, r0, r2, ne +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r2, gt +; CHECK-NEXT: rsbs r2, r4, #0 +; CHECK-NEXT: mov.w r6, #0 +; CHECK-NEXT: sbcs.w r2, r6, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r6, r0, r2, ne +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r6, r1 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q6, q0, q5 +; CHECK-NEXT: bl __fixhfdi +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: cset r2, gt +; CHECK-NEXT: rsbs r2, r4, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: sbcs.w r2, r6, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r0, r2, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: vmov q0[3], q0[1], r0, r6 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r6, r1 +; CHECK-NEXT: bfi r6, r2, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r6, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s2, s24 +; CHECK-NEXT: vmov.f32 s3, s26 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %conv = fptosi <4 x half> %x to <4 x i64> @@ -1828,8 +1872,8 @@ define arm_aapcs_vfpcc <2 x i16> @stest_f64i16_mm(<2 x double> %x) { ; CHECK-LABEL: stest_f64i16_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 @@ -1840,63 +1884,63 @@ ; CHECK-NEXT: vmov r0, 
r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: movw r2, #32767 +; CHECK-NEXT: movw r4, #32767 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: adr.w r12, .LCPI36_0 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: movw lr, #32768 -; CHECK-NEXT: vmov r0, r6, d0 ; CHECK-NEXT: movt lr, #65535 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r5, mi -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r5, r1, r2, ne -; CHECK-NEXT: cmp r1, r2 -; CHECK-NEXT: csel r1, r1, r2, lo -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r5, r1, r5, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r1, r3, r12, mi -; CHECK-NEXT: cmp.w r1, #-1 -; CHECK-NEXT: cset r3, gt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r3, r5, lr, ne -; CHECK-NEXT: cmn.w r5, #32768 -; CHECK-NEXT: csel r5, r5, lr, hi -; CHECK-NEXT: adds r4, r1, #1 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: orr.w r1, r1, r1, asr #31 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r3, r5, r3, ne -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: cset r4, mi -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r4, r0, r2, ne -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: csel r0, r0, r2, lo -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: cset r2, eq -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r0, r0, r4, ne -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r2, r6, r12, mi -; CHECK-NEXT: cmp.w r2, #-1 -; CHECK-NEXT: cset r6, gt -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r0, lr, ne -; CHECK-NEXT: cmn.w r0, #32768 -; CHECK-NEXT: csel r0, r0, lr, hi -; CHECK-NEXT: adds r5, r2, #1 -; CHECK-NEXT: cset r5, eq -; CHECK-NEXT: orr.w r2, r2, r2, asr #31 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r0, r0, r6, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: mov.w r12, #-1 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: subs r1, r1, r4 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: subs r1, r3, r4 +; CHECK-NEXT: sbcs r1, r5, #0 +; CHECK-NEXT: adr r4, .LCPI36_1 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: subs.w r1, lr, r1 +; CHECK-NEXT: sbcs.w r1, r12, r2 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: subs.w r1, lr, r3 +; CHECK-NEXT: sbcs.w r1, r12, r5 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI36_0: +; CHECK-NEXT: .long 32767 @ 0x7fff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 32767 @ 0x7fff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .LCPI36_1: +; CHECK-NEXT: .long 4294934528 @ 0xffff8000 +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 4294934528 @ 0xffff8000 +; CHECK-NEXT: .long 4294967295 @ 0xffffffff entry: %conv = fptosi <2 x double> %x to <2 x i32> 
%spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> ) @@ -1920,24 +1964,26 @@ ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: movw r4, #65535 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 -; CHECK-NEXT: movw r5, #65535 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: cmp r0, r5 -; CHECK-NEXT: csel r0, r0, r5, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r0, r0, r5, ne -; CHECK-NEXT: cmp r2, r5 -; CHECK-NEXT: csel r2, r2, r5, lo -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r2, r2, r5, ne -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: subs r0, r0, r4 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r5, r0, #0, #8 +; CHECK-NEXT: subs r0, r2, r4 +; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -1964,59 +2010,44 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: movw r4, #65535 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r1, r0, d0 -; CHECK-NEXT: vldr s1, .LCPI38_0 -; CHECK-NEXT: vmov.f32 s3, s1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r5, mi -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r12, r2, r4, ne -; CHECK-NEXT: cmp r2, r4 -; CHECK-NEXT: csel r2, r2, r4, lo -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r5, eq -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r2, r2, r12, ne -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r3, r3, r12, mi -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r5, gt -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r5, r2, r5, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r2, r2, r5, ne -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r3, mi -; CHECK-NEXT: vmov s2, r2 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r3, r1, r4, ne -; CHECK-NEXT: cmp r1, r4 -; CHECK-NEXT: csel r1, r1, r4, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r5, eq -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r1, r1, r3, ne -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel r0, r0, r12, mi -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r3, gt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r3, r1, r3, ne -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel r0, r1, r3, ne -; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: subs r1, r1, r4 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: subs r1, r3, r4 +; CHECK-NEXT: sbcs r1, r5, #0 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov r1, r2, d0 +; 
CHECK-NEXT: vmov r3, r5, d1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: sbcs.w r1, r0, r2 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: rsbs r2, r3, #0 +; CHECK-NEXT: sbcs.w r2, r0, r5 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI38_0: -; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %conv = fptosi <2 x double> %x to <2 x i32> %spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> ) diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll --- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll @@ -2668,137 +2668,82 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI32_0 -; CHECK-NEXT: vmov r9, r8, d9 -; CHECK-NEXT: vmov r11, r10, d0 -; CHECK-NEXT: str.w r11, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: str.w r10, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: vmov r8, r7, d8 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: strd r2, r3, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI32_1 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: vmov r5, r3, d0 -; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str r5, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r0, #-1 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: vmov r7, r6, d8 -; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: strd r2, r3, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: bl __aeabi_d2iz ; CHECK-NEXT: mov 
r11, r0 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r11, #-1 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: vmov r6, r5, d9 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: ldrd r2, r3, [sp, #20] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r5, #-1 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 -; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 -; CHECK-NEXT: ldrd r2, r3, [sp, #20] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: and r0, r11, #1 +; CHECK-NEXT: ldrd r2, r3, [sp, #12] @ 8-byte Folded Reload +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: bfi r4, r0, #0, #1 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: ldrd r2, r3, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_d2iz +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r10, #-1 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: moveq.w r7, #-1 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r11, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r10, r5 -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: and r0, r7, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r4, r0, #1, #1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: strb r4, [r0] +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll --- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll @@ -2141,103 +2141,66 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI32_0 -; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: vmov r5, r6, d8 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: vmov r10, r9, d0 ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI32_1 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: vmov r2, r11, d0 -; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: str.w r11, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: vmov r4, r11, d0 +; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r7, r6, d8 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: bl __aeabi_d2uiz +; CHECK-NEXT: vmov r6, r5, d9 ; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: csel r0, r0, r8, ne -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 -; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: str.w r10, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: mov r8, r9 -; CHECK-NEXT: str.w r9, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: csel r9, r0, r9, ne -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: and r0, r0, #1 ; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r9, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: bfi r7, r0, #0, #1 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: ldr.w r10, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel r5, r1, r0, ne -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 -; CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: 
mov r3, r10 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel r0, r1, r0, ne +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_d2uiz ; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csel r0, r0, r4, ne +; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r9, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r5 -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r7, r0, #1, #1 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll @@ -6,10 +6,12 @@ declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare <4 x i1> @llvm.arm.mve.vctp64(i32) +declare i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>) +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) @@ -209,6 +211,32 @@ ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x double> @test_vpselq_f64(<2 x double> %a, <2 x double> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = select <2 x i1> %1, <2 x double> %a, <2 x double> %b + ret <2 x double> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vpselq_i64_2(<2 x i64> %a, <2 x i64> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i64_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -330,116 +330,120 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_ops_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; 
CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov.i64 q3, #0xffffffff -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov.f32 s10, s5 ; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r0, d4 +; CHECK-NEXT: vmov.f32 s18, s1 +; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmov r11, r2, d5 -; CHECK-NEXT: vmov r9, lr, d3 -; CHECK-NEXT: vmov r10, s12 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r8, s2 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: adds.w r4, r1, r11 -; CHECK-NEXT: asr.w r0, r1, #31 -; CHECK-NEXT: adc.w r3, r0, r2 -; CHECK-NEXT: asrl r4, r3, r11 -; CHECK-NEXT: subs.w r0, r4, r11 -; CHECK-NEXT: sbc.w r2, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r11 -; CHECK-NEXT: adds.w r6, r8, r9 -; CHECK-NEXT: mla r7, r2, r11, r4 -; CHECK-NEXT: asr.w r3, r8, #31 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: rsbs r2, r1, #0 -; CHECK-NEXT: asrl r6, r3, r9 -; CHECK-NEXT: lsll r0, r7, r2 -; CHECK-NEXT: subs.w r5, r6, r9 -; CHECK-NEXT: vmov r6, r2, d4 -; CHECK-NEXT: lsll r0, r7, r11 -; CHECK-NEXT: sbc.w r3, r3, lr -; CHECK-NEXT: asr.w r7, r10, #31 -; CHECK-NEXT: adds.w r4, r10, r6 -; CHECK-NEXT: adcs r7, r2 -; CHECK-NEXT: asrl r4, r7, r6 -; CHECK-NEXT: subs r4, r4, r6 -; CHECK-NEXT: sbcs r7, r2 -; CHECK-NEXT: umull r2, r4, r4, r6 -; CHECK-NEXT: mla r7, r7, r6, r4 -; CHECK-NEXT: rsb.w r4, r10, #0 -; CHECK-NEXT: lsll r2, r7, r4 -; CHECK-NEXT: lsll r2, r7, r6 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 -; CHECK-NEXT: umull r12, r0, r5, r9 -; CHECK-NEXT: mul r7, r5, lr -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: orrs r0, r7 -; CHECK-NEXT: mla r7, r3, r9, r0 -; CHECK-NEXT: vmov r3, r4, d2 -; CHECK-NEXT: asr.w lr, r5, #31 -; CHECK-NEXT: adds r0, r5, r3 -; CHECK-NEXT: adc.w r1, lr, r4 -; CHECK-NEXT: asrl r0, r1, r3 -; CHECK-NEXT: subs r0, r0, r3 -; CHECK-NEXT: sbc.w lr, r1, r4 -; CHECK-NEXT: umull r2, r1, r0, r3 -; CHECK-NEXT: muls r0, r4, r0 -; CHECK-NEXT: eor.w r4, r5, r3 -; CHECK-NEXT: orr.w r4, r4, r5, asr #31 -; CHECK-NEXT: orrs r1, r0 -; CHECK-NEXT: eor.w r0, r8, r9 -; CHECK-NEXT: orr.w r0, r0, r8, asr #31 -; CHECK-NEXT: mla r1, lr, r3, r1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, eq -; CHECK-NEXT: vmov.32 q0[1], r4 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vbic q4, q1, q0 -; CHECK-NEXT: eor.w r0, r4, r11 -; CHECK-NEXT: orr.w r0, r0, r4, asr #31 -; CHECK-NEXT: eor.w r4, r10, r6 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: orr.w r4, r4, r10, asr #31 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, eq -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r0 -; CHECK-NEXT: rsb.w r0, r8, #0 -; CHECK-NEXT: lsll r12, r7, r0 -; CHECK-NEXT: rsbs r0, r5, #0 -; CHECK-NEXT: lsll r2, r1, r0 -; CHECK-NEXT: vbic q1, q2, q5 -; CHECK-NEXT: vand q2, q3, q5 -; CHECK-NEXT: lsll r12, r7, r9 -; CHECK-NEXT: lsll r2, r1, r3 -; CHECK-NEXT: vorr q1, q2, q1 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r12 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q4 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, 
r9, r10, r11, pc} +; CHECK-NEXT: vmov r9, s0 +; CHECK-NEXT: adds r4, r3, r1 +; CHECK-NEXT: asr.w r6, r3, #31 +; CHECK-NEXT: adc.w r5, r6, r0 +; CHECK-NEXT: asrl r4, r5, r1 +; CHECK-NEXT: subs r6, r4, r1 +; CHECK-NEXT: sbc.w r8, r5, r0 +; CHECK-NEXT: umull r10, lr, r6, r1 +; CHECK-NEXT: muls r6, r0, r6 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: orr.w lr, lr, r6 +; CHECK-NEXT: adds r6, r0, r2 +; CHECK-NEXT: asr.w r5, r0, #31 +; CHECK-NEXT: adc.w r7, r5, r12 +; CHECK-NEXT: asrl r6, r7, r2 +; CHECK-NEXT: mla r5, r8, r1, lr +; CHECK-NEXT: subs r4, r6, r2 +; CHECK-NEXT: sbc.w lr, r7, r12 +; CHECK-NEXT: umull r6, r7, r4, r2 +; CHECK-NEXT: mul r4, r4, r12 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: orr.w r8, r7, r4 +; CHECK-NEXT: eor.w r7, r3, r1 +; CHECK-NEXT: orr.w r7, r7, r3, asr #31 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cset r7, eq +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: csetm r7, ne +; CHECK-NEXT: bfi r4, r7, #0, #8 +; CHECK-NEXT: eor.w r7, r0, r2 +; CHECK-NEXT: orr.w r7, r7, r0, asr #31 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cset r7, eq +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: csetm r7, ne +; CHECK-NEXT: bfi r4, r7, #8, #8 +; CHECK-NEXT: asr.w r7, r9, #31 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: rsbs r4, r3, #0 +; CHECK-NEXT: mla r3, lr, r2, r8 +; CHECK-NEXT: lsll r10, r5, r4 +; CHECK-NEXT: lsll r10, r5, r1 +; CHECK-NEXT: lsll r6, r3, r0 +; CHECK-NEXT: lsll r6, r3, r2 +; CHECK-NEXT: vmov q4[2], q4[0], r10, r6 +; CHECK-NEXT: vmov q4[3], q4[1], r5, r3 +; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: vmov r0, r5, d3 +; CHECK-NEXT: vpsel q2, q4, q2 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: adds.w r6, r9, r1 +; CHECK-NEXT: adcs r7, r3 +; CHECK-NEXT: asrl r6, r7, r1 +; CHECK-NEXT: subs.w r8, r6, r1 +; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: sbc.w lr, r7, r3 +; CHECK-NEXT: umull r2, r7, r8, r1 +; CHECK-NEXT: adds r4, r6, r0 +; CHECK-NEXT: asr.w r3, r6, #31 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: asrl r4, r3, r0 +; CHECK-NEXT: subs r4, r4, r0 +; CHECK-NEXT: sbcs r3, r5 +; CHECK-NEXT: umull r4, r5, r4, r0 +; CHECK-NEXT: mla r3, r3, r0, r5 +; CHECK-NEXT: eor.w r5, r9, r1 +; CHECK-NEXT: orr.w r5, r5, r9, asr #31 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cset r5, eq +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: bfi r12, r5, #0, #8 +; CHECK-NEXT: eor.w r5, r6, r0 +; CHECK-NEXT: orr.w r5, r5, r6, asr #31 +; CHECK-NEXT: rsbs r6, r6, #0 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: lsll r4, r3, r6 +; CHECK-NEXT: cset r5, eq +; CHECK-NEXT: lsll r4, r3, r0 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: rsb.w r0, r9, #0 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: bfi r12, r5, #8, #8 +; CHECK-NEXT: mla r5, lr, r1, r7 +; CHECK-NEXT: vmsr p0, r12 +; CHECK-NEXT: lsll r2, r5, r0 +; CHECK-NEXT: lsll r2, r5, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -93,55 +93,57 @@ ; CHECK-LE-NEXT: push {r4, r5, r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: ldrd lr, r1, [r1] -; CHECK-LE-NEXT: rsbs 
r3, r1, #0 -; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r12, r1, asr #31 -; CHECK-LE-NEXT: cset r3, lt -; CHECK-LE-NEXT: @ implicit-def: $q1 -; CHECK-LE-NEXT: vmov q0[2], q0[0], lr, r1 -; CHECK-LE-NEXT: cmp r3, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r3, #1 +; CHECK-LE-NEXT: ldrd r12, lr, [r1] +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: rsbs.w r1, r12, #0 +; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr +; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-LE-NEXT: cset r1, lt +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 -; CHECK-LE-NEXT: sbcs.w r4, r12, lr, asr #31 -; CHECK-LE-NEXT: mov.w lr, #0 -; CHECK-LE-NEXT: cset r4, lt -; CHECK-LE-NEXT: bfi r3, r4, #0, #1 -; CHECK-LE-NEXT: and r12, r3, #3 -; CHECK-LE-NEXT: lsls r3, r3, #31 +; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 +; CHECK-LE-NEXT: bfi r3, r1, #0, #1 +; CHECK-LE-NEXT: cset r1, lt +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: bfi r3, r1, #1, #1 +; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: ldrne r3, [r2] -; CHECK-LE-NEXT: vmovne.32 q1[0], r3 -; CHECK-LE-NEXT: lsls.w r1, r12, #30 +; CHECK-LE-NEXT: ldrne r1, [r2] +; CHECK-LE-NEXT: vmovne.32 q0[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] -; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 -; CHECK-LE-NEXT: vmov r1, s6 +; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 ; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: vmov r3, s4 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-LE-NEXT: rsbs r5, r2, #0 -; CHECK-LE-NEXT: asr.w r12, r1, #31 -; CHECK-LE-NEXT: sbcs.w r1, lr, r2, asr #31 -; CHECK-LE-NEXT: vmov r1, s0 +; CHECK-LE-NEXT: vmov r4, s0 +; CHECK-LE-NEXT: vmov q0[2], q0[0], r4, r2 +; CHECK-LE-NEXT: rsbs r5, r3, #0 +; CHECK-LE-NEXT: asr.w r12, r2, #31 +; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31 +; CHECK-LE-NEXT: vmov r3, s6 ; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: asr.w lr, r4, #31 ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: asr.w r4, r3, #31 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: vmov q1[3], q1[1], r4, r12 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r1, asr #31 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: rsbs r5, r3, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: vstrne d2, [r0] +; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi -; CHECK-LE-NEXT: vstrmi d3, [r0, #8] +; CHECK-LE-NEXT: vstrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r4, r5, r7, pc} ; @@ -151,22 +153,23 @@ ; CHECK-BE-NEXT: push {r4, r5, r7, lr} ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: ldrd r3, lr, [r1] -; CHECK-BE-NEXT: mov.w r12, #0 +; CHECK-BE-NEXT: ldrd r12, lr, [r1] +; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-BE-NEXT: cset r3, lt +; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr +; CHECK-BE-NEXT: cmp r3, #0 ; CHECK-BE-NEXT: @ 
implicit-def: $q2 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: vmov q0[3], q0[1], r3, lr -; CHECK-BE-NEXT: sbcs.w r1, r12, r3, asr #31 +; CHECK-BE-NEXT: csetm lr, ne +; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-BE-NEXT: bfi r1, lr, #0, #1 ; CHECK-BE-NEXT: cset r3, lt ; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r3, #1 -; CHECK-BE-NEXT: rsbs.w r1, lr, #0 -; CHECK-BE-NEXT: sbcs.w r1, r12, lr, asr #31 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r3, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r3, #3 -; CHECK-BE-NEXT: lsls r3, r3, #30 +; CHECK-BE-NEXT: csetm r3, ne +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB5_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: ldr r3, [r2] @@ -184,28 +187,29 @@ ; CHECK-BE-NEXT: .LBB5_4: @ %else2 ; CHECK-BE-NEXT: vrev64.32 q0, q2 ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: vmov r1, s3 -; CHECK-BE-NEXT: movs r4, #0 +; CHECK-BE-NEXT: vmov r2, s3 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vmov r3, s1 -; CHECK-BE-NEXT: vmov r2, s9 -; CHECK-BE-NEXT: asr.w r12, r1, #31 +; CHECK-BE-NEXT: vmov r4, s11 +; CHECK-BE-NEXT: asr.w r12, r2, #31 ; CHECK-BE-NEXT: asr.w lr, r3, #31 -; CHECK-BE-NEXT: rsbs r5, r2, #0 +; CHECK-BE-NEXT: rsbs r5, r4, #0 ; CHECK-BE-NEXT: vmov q1[2], q1[0], lr, r12 -; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r1 -; CHECK-BE-NEXT: vmov r1, s11 +; CHECK-BE-NEXT: sbcs.w r4, r1, r4, asr #31 +; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-BE-NEXT: vmov r3, s9 ; CHECK-BE-NEXT: cset r2, lt ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: sbcs.w r1, r4, r1, asr #31 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: rsbs r5, r3, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r3, asr #31 +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi d0, [r0] ; CHECK-BE-NEXT: lsls r1, r1, #31 @@ -229,50 +233,52 @@ ; CHECK-LE-NEXT: push {r4, r5, r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: ldrd lr, r1, [r1] -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r12, r1, asr #31 -; CHECK-LE-NEXT: cset r3, lt +; CHECK-LE-NEXT: ldrd r12, lr, [r1] +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], lr, r1 -; CHECK-LE-NEXT: cmp r3, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r3, #1 +; CHECK-LE-NEXT: rsbs.w r1, r12, #0 +; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr +; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-LE-NEXT: cset r1, lt +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 -; CHECK-LE-NEXT: sbcs.w r4, r12, lr, asr #31 -; CHECK-LE-NEXT: mov.w lr, #0 -; CHECK-LE-NEXT: cset r4, lt -; CHECK-LE-NEXT: bfi r3, r4, #0, #1 -; CHECK-LE-NEXT: and r12, r3, #3 -; CHECK-LE-NEXT: lsls r3, r3, #31 +; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 +; CHECK-LE-NEXT: bfi r3, r1, #0, #1 +; CHECK-LE-NEXT: cset r1, lt +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne +; 
CHECK-LE-NEXT: bfi r3, r1, #1, #1 +; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: ldrne r3, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r3 -; CHECK-LE-NEXT: lsls.w r1, r12, #30 +; CHECK-LE-NEXT: ldrne r1, [r2] +; CHECK-LE-NEXT: vmovne.32 q0[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] ; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r1, s2 -; CHECK-LE-NEXT: vmov r2, s6 -; CHECK-LE-NEXT: vmov r3, s0 -; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1 -; CHECK-LE-NEXT: rsbs r5, r2, #0 -; CHECK-LE-NEXT: asr.w r12, r1, #31 -; CHECK-LE-NEXT: sbcs.w r1, lr, r2, asr #31 -; CHECK-LE-NEXT: vmov r1, s4 +; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov r4, s0 +; CHECK-LE-NEXT: vmov q0[2], q0[0], r4, r2 +; CHECK-LE-NEXT: rsbs r5, r3, #0 +; CHECK-LE-NEXT: asr.w r12, r2, #31 +; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31 +; CHECK-LE-NEXT: vmov r3, s6 ; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: asr.w lr, r4, #31 ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: asr.w r4, r3, #31 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: vmov q0[3], q0[1], r4, r12 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r1, asr #31 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: rsbs r5, r3, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, r3, d0 ; CHECK-LE-NEXT: strdne r2, r3, [r0] @@ -289,22 +295,23 @@ ; CHECK-BE-NEXT: push {r4, r5, r7, lr} ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: ldrd r3, lr, [r1] -; CHECK-BE-NEXT: mov.w r12, #0 +; CHECK-BE-NEXT: ldrd r12, lr, [r1] +; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-BE-NEXT: cset r3, lt +; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr +; CHECK-BE-NEXT: cmp r3, #0 ; CHECK-BE-NEXT: @ implicit-def: $q2 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: vmov q0[3], q0[1], r3, lr -; CHECK-BE-NEXT: sbcs.w r1, r12, r3, asr #31 +; CHECK-BE-NEXT: csetm lr, ne +; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-BE-NEXT: bfi r1, lr, #0, #1 ; CHECK-BE-NEXT: cset r3, lt ; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r3, #1 -; CHECK-BE-NEXT: rsbs.w r1, lr, #0 -; CHECK-BE-NEXT: sbcs.w r1, r12, lr, asr #31 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r3, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r3, #3 -; CHECK-BE-NEXT: lsls r3, r3, #30 +; CHECK-BE-NEXT: csetm r3, ne +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB6_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: ldr r3, [r2] @@ -322,28 +329,29 @@ ; CHECK-BE-NEXT: .LBB6_4: @ %else2 ; CHECK-BE-NEXT: vrev64.32 q0, q2 ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: vmov r1, s3 -; CHECK-BE-NEXT: movs r4, #0 +; CHECK-BE-NEXT: vmov r2, s3 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vmov r3, s1 -; CHECK-BE-NEXT: vmov r2, s9 -; CHECK-BE-NEXT: asr.w r12, r1, #31 +; CHECK-BE-NEXT: vmov r4, s11 +; CHECK-BE-NEXT: 
asr.w r12, r2, #31 ; CHECK-BE-NEXT: asr.w lr, r3, #31 -; CHECK-BE-NEXT: rsbs r5, r2, #0 +; CHECK-BE-NEXT: rsbs r5, r4, #0 ; CHECK-BE-NEXT: vmov q1[2], q1[0], lr, r12 -; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r1 -; CHECK-BE-NEXT: vmov r1, s11 +; CHECK-BE-NEXT: sbcs.w r4, r1, r4, asr #31 +; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-BE-NEXT: vmov r3, s9 ; CHECK-BE-NEXT: cset r2, lt ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: sbcs.w r1, r4, r1, asr #31 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: rsbs r5, r3, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r3, asr #31 +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, r3, d0 ; CHECK-BE-NEXT: strdmi r3, r2, [r0] @@ -369,46 +377,48 @@ ; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: ldrd lr, r1, [r1] -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r12, r1, asr #31 -; CHECK-LE-NEXT: cset r3, lt +; CHECK-LE-NEXT: ldrd r12, lr, [r1] +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], lr, r1 -; CHECK-LE-NEXT: cmp r3, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r3, #1 -; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-LE-NEXT: sbcs.w r4, r12, lr, asr #31 -; CHECK-LE-NEXT: cset r4, lt -; CHECK-LE-NEXT: bfi r3, r4, #0, #1 -; CHECK-LE-NEXT: and r12, r3, #3 -; CHECK-LE-NEXT: lsls r3, r3, #31 +; CHECK-LE-NEXT: rsbs.w r1, r12, #0 +; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr +; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-LE-NEXT: cset r1, lt +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: rsbs.w r4, lr, #0 +; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 +; CHECK-LE-NEXT: bfi r3, r1, #0, #1 +; CHECK-LE-NEXT: cset r1, lt +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: bfi r3, r1, #1, #1 +; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: ldrne r3, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r3 -; CHECK-LE-NEXT: lsls.w r1, r12, #30 +; CHECK-LE-NEXT: ldrne r1, [r2] +; CHECK-LE-NEXT: vmovne.32 q0[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] ; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r2, s6 +; CHECK-LE-NEXT: vmov r2, s4 ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: vand q0, q0, q2 ; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov r3, s6 ; CHECK-LE-NEXT: sbcs.w r2, r1, r2, asr #31 ; CHECK-LE-NEXT: cset r2, lt ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 +; CHECK-LE-NEXT: csetm r2, ne ; CHECK-LE-NEXT: rsbs r4, r3, #0 -; CHECK-LE-NEXT: sbcs.w r1, r1, r3, asr #31 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: 
csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 @@ -423,55 +433,57 @@ ; CHECK-BE-NEXT: push {r7, lr} ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: ldrd r3, lr, [r1] -; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: @ implicit-def: $q0 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, lr -; CHECK-BE-NEXT: sbcs.w r1, r12, r3, asr #31 +; CHECK-BE-NEXT: ldrd r12, lr, [r1] +; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 ; CHECK-BE-NEXT: cset r3, lt +; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr ; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r3, #1 -; CHECK-BE-NEXT: rsbs.w r1, lr, #0 -; CHECK-BE-NEXT: sbcs.w r1, r12, lr, asr #31 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r3, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r3, #3 -; CHECK-BE-NEXT: lsls r3, r3, #30 +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: csetm lr, ne +; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-BE-NEXT: bfi r1, lr, #0, #1 +; CHECK-BE-NEXT: cset r3, lt +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: csetm r3, ne +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB7_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: ldr r3, [r2] ; CHECK-BE-NEXT: vmov.32 q2[1], r3 -; CHECK-BE-NEXT: vrev64.32 q0, q2 +; CHECK-BE-NEXT: vrev64.32 q1, q2 ; CHECK-BE-NEXT: .LBB7_2: @ %else -; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB7_4 ; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1 ; CHECK-BE-NEXT: ldr r1, [r2, #4] -; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmov.32 q1[3], r1 ; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: vmov.32 q0[3], r1 +; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: .LBB7_4: @ %else2 ; CHECK-BE-NEXT: vrev64.32 q3, q2 -; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: vmov r2, s13 -; CHECK-BE-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-BE-NEXT: vand q0, q0, q1 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmov r2, s15 +; CHECK-BE-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-BE-NEXT: vand q0, q1, q0 ; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: vmov r3, s15 -; CHECK-BE-NEXT: sbcs.w r2, r12, r2, asr #31 +; CHECK-BE-NEXT: vmov r3, s13 +; CHECK-BE-NEXT: sbcs.w r2, r1, r2, asr #31 ; CHECK-BE-NEXT: cset r2, lt ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: sbcs.w r1, r12, r3, asr #31 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: csetm r12, ne +; CHECK-BE-NEXT: rsbs r2, r3, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, r3, asr #31 +; CHECK-BE-NEXT: bfi r1, r12, #0, #1 +; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi d0, [r0] ; CHECK-BE-NEXT: lsls r1, r1, #31 @@ -495,46 +507,48 @@ ; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: ldrd lr, r1, [r1] -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r12, r1, asr #31 -; CHECK-LE-NEXT: cset r3, lt +; 
CHECK-LE-NEXT: ldrd r12, lr, [r1] +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], lr, r1 -; CHECK-LE-NEXT: cmp r3, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r3, #1 -; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-LE-NEXT: sbcs.w r4, r12, lr, asr #31 -; CHECK-LE-NEXT: cset r4, lt -; CHECK-LE-NEXT: bfi r3, r4, #0, #1 -; CHECK-LE-NEXT: and r12, r3, #3 -; CHECK-LE-NEXT: lsls r3, r3, #31 +; CHECK-LE-NEXT: rsbs.w r1, r12, #0 +; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr +; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-LE-NEXT: cset r1, lt +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: rsbs.w r4, lr, #0 +; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 +; CHECK-LE-NEXT: bfi r3, r1, #0, #1 +; CHECK-LE-NEXT: cset r1, lt +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: bfi r3, r1, #1, #1 +; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: ldrne r3, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r3 -; CHECK-LE-NEXT: lsls.w r1, r12, #30 +; CHECK-LE-NEXT: ldrne r1, [r2] +; CHECK-LE-NEXT: vmovne.32 q0[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] ; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r2, s6 +; CHECK-LE-NEXT: vmov r2, s4 ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: vand q0, q0, q2 ; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov r3, s6 ; CHECK-LE-NEXT: sbcs.w r2, r1, r2, asr #31 ; CHECK-LE-NEXT: cset r2, lt ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 +; CHECK-LE-NEXT: csetm r2, ne ; CHECK-LE-NEXT: rsbs r4, r3, #0 -; CHECK-LE-NEXT: sbcs.w r1, r1, r3, asr #31 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, r3, d0 ; CHECK-LE-NEXT: strdne r2, r3, [r0] @@ -551,55 +565,57 @@ ; CHECK-BE-NEXT: push {r7, lr} ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: ldrd r3, lr, [r1] -; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: @ implicit-def: $q0 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, lr -; CHECK-BE-NEXT: sbcs.w r1, r12, r3, asr #31 +; CHECK-BE-NEXT: ldrd r12, lr, [r1] +; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 ; CHECK-BE-NEXT: cset r3, lt +; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr ; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r3, #1 -; CHECK-BE-NEXT: rsbs.w r1, lr, #0 -; CHECK-BE-NEXT: sbcs.w r1, r12, lr, asr #31 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r3, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r3, #3 -; CHECK-BE-NEXT: lsls r3, r3, #30 +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: csetm lr, ne +; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-BE-NEXT: bfi r1, lr, #0, #1 +; CHECK-BE-NEXT: cset r3, lt +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: csetm r3, ne +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB8_2 ; CHECK-BE-NEXT: @ %bb.1: @ 
%cond.load ; CHECK-BE-NEXT: ldr r3, [r2] ; CHECK-BE-NEXT: vmov.32 q2[1], r3 -; CHECK-BE-NEXT: vrev64.32 q0, q2 +; CHECK-BE-NEXT: vrev64.32 q1, q2 ; CHECK-BE-NEXT: .LBB8_2: @ %else -; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB8_4 ; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1 ; CHECK-BE-NEXT: ldr r1, [r2, #4] -; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmov.32 q1[3], r1 ; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: vmov.32 q0[3], r1 +; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: .LBB8_4: @ %else2 ; CHECK-BE-NEXT: vrev64.32 q3, q2 -; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: vmov r2, s13 -; CHECK-BE-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-BE-NEXT: vand q0, q0, q1 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmov r2, s15 +; CHECK-BE-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-BE-NEXT: vand q0, q1, q0 ; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: vmov r3, s15 -; CHECK-BE-NEXT: sbcs.w r2, r12, r2, asr #31 +; CHECK-BE-NEXT: vmov r3, s13 +; CHECK-BE-NEXT: sbcs.w r2, r1, r2, asr #31 ; CHECK-BE-NEXT: cset r2, lt ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: sbcs.w r1, r12, r3, asr #31 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: csetm r12, ne +; CHECK-BE-NEXT: rsbs r2, r3, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, r3, asr #31 +; CHECK-BE-NEXT: bfi r1, r12, #0, #1 +; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, r3, d0 ; CHECK-BE-NEXT: strdmi r3, r2, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -1754,21 +1754,22 @@ ; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r1, r2, d1 -; CHECK-LE-NEXT: mov.w lr, #0 -; CHECK-LE-NEXT: vmov r3, r12, d0 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r2 +; CHECK-LE-NEXT: vmov r2, r3, d0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmov r12, lr, d1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r1, r3 ; CHECK-LE-NEXT: cset r2, lt ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: rsbs r1, r3, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r12 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, lr +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB49_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: vldr d1, .LCPI49_0 @@ -1795,21 +1796,22 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov.w lr, #0 -; CHECK-BE-NEXT: vmov r1, r2, d2 -; CHECK-BE-NEXT: vmov r12, r3, d3 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: sbcs.w r1, lr, r1 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmov r2, r3, d3 +; 
CHECK-BE-NEXT: vmov r12, lr, d2 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, r2 ; CHECK-BE-NEXT: cset r2, lt ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: sbcs.w r1, lr, r12 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r12 +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB49_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: vldr d1, .LCPI49_0 @@ -1841,21 +1843,22 @@ ; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r1, r2, d3 -; CHECK-LE-NEXT: mov.w lr, #0 -; CHECK-LE-NEXT: vmov r3, r12, d2 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r2 +; CHECK-LE-NEXT: vmov r2, r3, d2 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmov r12, lr, d3 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r1, r3 ; CHECK-LE-NEXT: cset r2, lt ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: rsbs r1, r3, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r12 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, lr +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB50_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: vldr d1, .LCPI50_0 @@ -1882,21 +1885,22 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: mov.w lr, #0 -; CHECK-BE-NEXT: vmov r1, r2, d0 -; CHECK-BE-NEXT: vmov r12, r3, d1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: sbcs.w r1, lr, r1 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmov r2, r3, d1 +; CHECK-BE-NEXT: vmov r12, lr, d0 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, r2 ; CHECK-BE-NEXT: cset r2, lt ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: sbcs.w r1, lr, r12 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r12 +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB50_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: vldr d1, .LCPI50_0 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -939,21 +939,22 @@ ; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r1, r2, d1 -; CHECK-LE-NEXT: mov.w lr, #0 
-; CHECK-LE-NEXT: vmov r3, r12, d0 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r2 +; CHECK-LE-NEXT: vmov r2, r3, d0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmov r12, lr, d1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r1, r3 ; CHECK-LE-NEXT: cset r2, lt ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: rsbs r1, r3, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r12 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, lr +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 @@ -969,21 +970,22 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov.w lr, #0 -; CHECK-BE-NEXT: vmov r1, r2, d2 -; CHECK-BE-NEXT: vmov r12, r3, d3 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: sbcs.w r1, lr, r1 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmov r2, r3, d3 +; CHECK-BE-NEXT: vmov r12, lr, d2 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, r2 ; CHECK-BE-NEXT: cset r2, lt ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: sbcs.w r1, lr, r12 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r12 +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi d0, [r0] ; CHECK-BE-NEXT: lsls r1, r1, #31 @@ -1004,21 +1006,22 @@ ; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r1, r2, d3 -; CHECK-LE-NEXT: mov.w lr, #0 -; CHECK-LE-NEXT: vmov r3, r12, d2 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r2 +; CHECK-LE-NEXT: vmov r2, r3, d2 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmov r12, lr, d3 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r1, r3 ; CHECK-LE-NEXT: cset r2, lt ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: rsbs r1, r3, #0 -; CHECK-LE-NEXT: sbcs.w r1, lr, r12 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, lr +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 @@ -1034,21 +1037,22 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: mov.w lr, #0 -; CHECK-BE-NEXT: vmov r1, r2, d4 -; CHECK-BE-NEXT: vmov r12, r3, d5 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; 
CHECK-BE-NEXT: sbcs.w r1, lr, r1 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmov r2, r3, d5 +; CHECK-BE-NEXT: vmov r12, lr, d4 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, r2 ; CHECK-BE-NEXT: cset r2, lt ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: rsbs r1, r3, #0 -; CHECK-BE-NEXT: sbcs.w r1, lr, r12 -; CHECK-BE-NEXT: cset r1, lt -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: sbcs.w r3, r1, r12 +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi d0, [r0] ; CHECK-BE-NEXT: lsls r1, r1, #31 diff --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll @@ -40,25 +40,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r12, lr, d2 -; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d3 +; CHECK-NEXT: vmov r4, r5, d1 ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r4, r12 -; CHECK-NEXT: sbcs.w r1, r5, lr -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: subs.w r0, r4, r12 +; CHECK-NEXT: sbcs.w r0, r5, lr +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = icmp slt <2 x i64> %s1, %s2 @@ -104,25 +104,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r12, lr, d2 -; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d3 +; CHECK-NEXT: vmov r4, r5, d1 ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r4, r12 -; CHECK-NEXT: sbcs.w r1, r5, lr -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: subs.w r0, r4, r12 +; CHECK-NEXT: sbcs.w r0, r5, lr +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = icmp ult <2 x i64> %s1, %s2 @@ -169,25 +169,25 @@ ; CHECK: @ 
%bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vmov r12, lr, d0 -; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: subs.w r0, r4, r12 +; CHECK-NEXT: sbcs.w r0, r5, lr ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r4, r12 -; CHECK-NEXT: sbcs.w r1, r5, lr -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = icmp sgt <2 x i64> %s1, %s2 @@ -233,25 +233,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vmov r12, lr, d0 -; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r4, r12 -; CHECK-NEXT: sbcs.w r1, r5, lr -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: subs.w r0, r4, r12 +; CHECK-NEXT: sbcs.w r0, r5, lr +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = icmp ugt <2 x i64> %s1, %s2 @@ -323,16 +323,18 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r2, r3, d10 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vmov r12, r1, d8 +; CHECK-NEXT: vmov r12, r1, d9 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov r2, r3, d11 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r4, r0, #0, #8 ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 @@ -340,11 +342,9 @@ ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 -; CHECK-NEXT: vbic q1, q5, q0 -; CHECK-NEXT: vand q0, q4, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r4, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vpsel q0, q4, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, pc} entry: diff --git 
a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll --- a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll @@ -141,45 +141,36 @@ define arm_aapcs_vfpcc <2 x i32> @smax2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: smax2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: asr.w r9, r0, #31 -; CHECK-NEXT: cmp.w r9, r1, asr #31 -; CHECK-NEXT: cset r3, gt -; CHECK-NEXT: asrs r5, r2, #31 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: asr.w r3, r1, #31 -; CHECK-NEXT: csel r12, r0, r1, ne -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel lr, r0, r1, hi -; CHECK-NEXT: cmp r9, r3 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: csel r12, lr, r12, ne -; CHECK-NEXT: cmp.w r5, r4, asr #31 -; CHECK-NEXT: cset r6, gt -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: asr.w r6, r4, #31 -; CHECK-NEXT: csel lr, r2, r4, ne -; CHECK-NEXT: cmp r2, r4 -; CHECK-NEXT: csel r8, r2, r4, hi -; CHECK-NEXT: cmp r5, r6 -; CHECK-NEXT: cset r7, eq -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r7, r8, lr, ne -; CHECK-NEXT: cmp.w r9, r1, asr #31 -; CHECK-NEXT: vmov q0[2], q0[0], r7, r12 -; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r3, r0, #31 -; CHECK-NEXT: cmp.w r5, r4, asr #31 -; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r6, r2, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r3 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asr.w lr, r1, #31 +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: sbcs.w r1, lr, r3, asr #31 +; CHECK-NEXT: asr.w r5, r3, #31 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: asr.w r12, r0, #31 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbcs.w r0, r12, r2, asr #31 +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: asrs r4, r2, #31 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vmov q1[3], q1[1], lr, r12 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: pop {r4, r5, r7, pc} %c = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %a, <2 x i32> %b) ret <2 x i32> %c } @@ -237,39 +228,26 @@ define arm_aapcs_vfpcc <2 x i64> @smax2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: smax2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov r1, r12, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: cset r0, gt +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: vmov r3, r2, d3 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel lr, r2, r1, ne -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: csel r1, r2, r1, hi +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r12, d1 +; CHECK-NEXT: subs r0, r3, r0 +; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: 
cmp r0, #0 -; CHECK-NEXT: csel lr, r1, lr, ne -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov r0, r4, d0 -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: cset r5, gt -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r5, r0, r1, ne -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: cset r6, eq -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel r0, r0, r1, hi -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r0, r0, r5, ne -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: csel r0, r3, r12, gt -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: csel r1, r4, r2, gt -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr %c = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b) ret <2 x i64> %c } @@ -279,71 +257,49 @@ define arm_aapcs_vfpcc void @smax4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) { ; CHECK-LABEL: smax4i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r2, r12, d7 -; CHECK-NEXT: vmov r3, lr, d3 -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: cset r1, gt +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov r1, r12, d2 +; CHECK-NEXT: vmov r3, r2, d6 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: sbcs.w r1, r2, r12 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r3, r2, ne -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: csel r2, r3, r2, hi -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r1, r2, r1, ne -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r4, r5, d2 -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r6, gt -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r4, r2, ne -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r7, eq -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: csel r2, r4, r2, hi -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r2, r2, r6, ne -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: csel r1, lr, r12, gt -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: csel r2, r5, r3, gt -; CHECK-NEXT: vmov r3, r7, d1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 -; CHECK-NEXT: vmov r2, r1, d5 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, r4, d7 +; CHECK-NEXT: subs.w r2, r2, lr +; CHECK-NEXT: sbcs.w r2, r4, r12 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vmov r4, r3, d4 +; CHECK-NEXT: vpsel q1, q1, q3 ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: cset r6, gt -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r3, r2, ne -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: cset r5, eq -; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: csel r2, r3, r2, hi -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r12, r2, r6, ne -; CHECK-NEXT: vmov r3, r6, d4 -; CHECK-NEXT: vmov r5, r4, d0 -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: cset r2, gt +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: vmov r4, r3, d5 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel lr, r5, r3, ne -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: cset r2, eq -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: csel r3, r5, r3, hi +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi 
r1, r2, #0, #8 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r3, lr, ne -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: csel r1, r7, r1, gt -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: csel r2, r4, r6, gt -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r1, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, pc} %c = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %a, <4 x i64> %b) store <4 x i64> %c, <4 x i64>* %p ret void @@ -489,38 +445,29 @@ define arm_aapcs_vfpcc <2 x i32> @umax2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: umax2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r12, d1 -; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: cset r3, hi -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r3, r2, r0, ne -; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r2, r0 -; CHECK-NEXT: csel r0, r2, r0, hi -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel lr, r0, r3, ne -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov r3, r0, d0 -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r4, r3, r1, ne -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: cset r2, eq -; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: csel r1, r3, r1, hi -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r1, r1, r4, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, lr -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: vmov r3, r2, d3 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r12, d1 +; CHECK-NEXT: subs r0, r3, r0 +; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr %c = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %a, <2 x i32> %b) ret <2 x i32> %c } @@ -578,39 +525,26 @@ define arm_aapcs_vfpcc <2 x i64> @umax2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: umax2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov r1, r12, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: cset r0, hi +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: vmov r3, r2, d3 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel lr, r2, r1, ne -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: csel r1, r2, r1, hi +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r12, d1 +; CHECK-NEXT: subs r0, r3, r0 +; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel lr, r1, lr, ne -; CHECK-NEXT: vmov r1, r2, d2 -; 
CHECK-NEXT: vmov r0, r4, d0 -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r5, r0, r1, ne -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: cset r6, eq -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel r0, r0, r1, hi -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r0, r0, r5, ne -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: csel r0, r3, r12, hi -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: csel r1, r4, r2, hi -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr %c = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b) ret <2 x i64> %c } @@ -620,71 +554,49 @@ define arm_aapcs_vfpcc void @umax4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) { ; CHECK-LABEL: umax4i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r2, r12, d7 -; CHECK-NEXT: vmov r3, lr, d3 -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: cset r1, hi +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov r1, r12, d2 +; CHECK-NEXT: vmov r3, r2, d6 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: sbcs.w r1, r2, r12 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: cset r1, lo ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r3, r2, ne -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: csel r2, r3, r2, hi -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r1, r2, r1, ne -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r4, r5, d2 -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r4, r2, ne -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r7, eq -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: csel r2, r4, r2, hi -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r2, r2, r6, ne -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: csel r1, lr, r12, hi -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: csel r2, r5, r3, hi -; CHECK-NEXT: vmov r3, r7, d1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 -; CHECK-NEXT: vmov r2, r1, d5 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, r4, d7 +; CHECK-NEXT: subs.w r2, r2, lr +; CHECK-NEXT: sbcs.w r2, r4, r12 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vmov r4, r3, d4 +; CHECK-NEXT: vpsel q1, q1, q3 ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r3, r2, ne -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: cset r5, eq -; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: csel r2, r3, r2, hi -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r12, r2, r6, ne -; CHECK-NEXT: vmov r3, r6, d4 -; CHECK-NEXT: vmov r5, r4, d0 -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: cset r2, hi +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: vmov r4, r3, d5 +; CHECK-NEXT: cset r2, lo ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel lr, r5, r3, ne -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: cset r2, eq -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: csel r3, r5, r3, hi +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r1, r2, #0, #8 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: subs r2, r4, r2 +; 
CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: cset r2, lo ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r3, lr, ne -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: csel r1, r7, r1, hi -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: csel r2, r4, r6, hi -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r1, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, pc} %c = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %a, <4 x i64> %b) store <4 x i64> %c, <4 x i64>* %p ret void @@ -830,45 +742,36 @@ define arm_aapcs_vfpcc <2 x i32> @smin2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: smin2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: asr.w r9, r0, #31 -; CHECK-NEXT: cmp.w r9, r1, asr #31 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: asrs r5, r2, #31 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: asr.w r3, r1, #31 -; CHECK-NEXT: csel r12, r0, r1, ne -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel lr, r0, r1, lo -; CHECK-NEXT: cmp r9, r3 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: csel r12, lr, r12, ne -; CHECK-NEXT: cmp.w r5, r4, asr #31 -; CHECK-NEXT: cset r6, lt -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: asr.w r6, r4, #31 -; CHECK-NEXT: csel lr, r2, r4, ne -; CHECK-NEXT: cmp r2, r4 -; CHECK-NEXT: csel r8, r2, r4, lo -; CHECK-NEXT: cmp r5, r6 -; CHECK-NEXT: cset r7, eq -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r7, r8, lr, ne -; CHECK-NEXT: cmp.w r9, r1, asr #31 -; CHECK-NEXT: vmov q0[2], q0[0], r7, r12 -; CHECK-NEXT: it lt -; CHECK-NEXT: asrlt r3, r0, #31 -; CHECK-NEXT: cmp.w r5, r4, asr #31 -; CHECK-NEXT: it lt -; CHECK-NEXT: asrlt r6, r2, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r3 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov lr, s2 +; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: asrs r3, r1, #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-NEXT: asr.w r12, lr, #31 +; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: subs r3, r3, r1 +; CHECK-NEXT: sbcs.w r1, r2, r1, asr #31 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: subs.w r1, lr, r0 +; CHECK-NEXT: sbcs.w r0, r12, r0, asr #31 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: pop {r7, pc} %c = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %a, <2 x i32> %b) ret <2 x i32> %c } @@ -926,39 +829,26 @@ define arm_aapcs_vfpcc <2 x i64> @smin2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: smin2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov r1, r12, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: cmp r3, r12 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 ; 
CHECK-NEXT: cset r0, lt +; CHECK-NEXT: vmov r3, r2, d1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel lr, r2, r1, ne -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: csel r1, r2, r1, lo +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: subs r0, r3, r0 +; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel lr, r1, lr, ne -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov r0, r4, d0 -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: cset r5, lt -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r5, r0, r1, ne -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: cset r6, eq -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel r0, r0, r1, lo -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r0, r0, r5, ne -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: csel r0, r3, r12, lt -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: csel r1, r4, r2, lt -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr %c = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b) ret <2 x i64> %c } @@ -968,71 +858,49 @@ define arm_aapcs_vfpcc void @smin4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) { ; CHECK-LABEL: smin4i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r2, r12, d7 -; CHECK-NEXT: vmov r3, lr, d3 -; CHECK-NEXT: cmp lr, r12 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov r1, r12, d6 +; CHECK-NEXT: vmov r3, r2, d2 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: sbcs.w r1, r2, r12 +; CHECK-NEXT: vmov lr, r12, d7 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r3, r2, ne -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: csel r2, r3, r2, lo -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r1, r2, r1, ne -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r4, r5, d2 -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r6, lt -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r4, r2, ne -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r7, eq -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: csel r2, r4, r2, lo -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r2, r2, r6, ne -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: csel r1, lr, r12, lt -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: csel r2, r5, r3, lt -; CHECK-NEXT: vmov r3, r7, d1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 -; CHECK-NEXT: vmov r2, r1, d5 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: subs.w r2, r2, lr +; CHECK-NEXT: sbcs.w r2, r4, r12 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmov r2, r12, d4 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vmov r4, r3, d0 +; CHECK-NEXT: vpsel q1, q1, q3 ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: cset r6, lt -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r3, r2, ne -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: cset r5, eq -; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: csel r2, r3, r2, lo -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r12, r2, r6, ne -; CHECK-NEXT: vmov r3, r6, d4 -; CHECK-NEXT: vmov r5, r4, 
d0 -; CHECK-NEXT: cmp r4, r6 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: vmov r4, r3, d1 ; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel lr, r5, r3, ne -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: cset r2, eq -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: csel r3, r5, r3, lo +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r1, r2, #0, #8 +; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r3, lr, ne -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: csel r1, r7, r1, lt -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: csel r2, r4, r6, lt -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r1, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, pc} %c = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %a, <4 x i64> %b) store <4 x i64> %c, <4 x i64>* %p ret void @@ -1178,38 +1046,29 @@ define arm_aapcs_vfpcc <2 x i32> @umin2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: umin2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.i64 q2, #0xffffffff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r12, d1 -; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: cset r3, lo -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r3, r2, r0, ne -; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r2, r0 -; CHECK-NEXT: csel r0, r2, r0, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel lr, r0, r3, ne -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov r3, r0, d0 -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: cset r4, lo -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r4, r3, r1, ne -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: cset r2, eq -; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: csel r1, r3, r1, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r1, r1, r4, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, lr -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: vmov r3, r2, d1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: subs r0, r3, r0 +; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr %c = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %a, <2 x i32> %b) ret <2 x i32> %c } @@ -1267,39 +1126,26 @@ define arm_aapcs_vfpcc <2 x i64> @umin2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: umin2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov r1, r12, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: cmp r3, r12 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: vmov r3, r2, d1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel lr, r2, r1, ne -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: cmp r2, r1 -; 
CHECK-NEXT: csel r1, r2, r1, lo +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: subs r0, r3, r0 +; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel lr, r1, lr, ne -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov r0, r4, d0 -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: cset r5, lo -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r5, r0, r1, ne -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: cset r6, eq -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel r0, r0, r1, lo -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r0, r0, r5, ne -; CHECK-NEXT: cmp r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: csel r0, r3, r12, lo -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: csel r1, r4, r2, lo -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr %c = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b) ret <2 x i64> %c } @@ -1309,71 +1155,49 @@ define arm_aapcs_vfpcc void @umin4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) { ; CHECK-LABEL: umin4i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r2, r12, d7 -; CHECK-NEXT: vmov r3, lr, d3 -; CHECK-NEXT: cmp lr, r12 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov r1, r12, d6 +; CHECK-NEXT: vmov r3, r2, d2 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: sbcs.w r1, r2, r12 +; CHECK-NEXT: vmov lr, r12, d7 ; CHECK-NEXT: cset r1, lo ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r3, r2, ne -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: cset r4, eq -; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: csel r2, r3, r2, lo -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r1, r2, r1, ne -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r4, r5, d2 -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r6, lo -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r4, r2, ne -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r7, eq -; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: csel r2, r4, r2, lo -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r2, r2, r6, ne -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: csel r1, lr, r12, lo -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: csel r2, r5, r3, lo -; CHECK-NEXT: vmov r3, r7, d1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 -; CHECK-NEXT: vmov r2, r1, d5 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: subs.w r2, r2, lr +; CHECK-NEXT: sbcs.w r2, r4, r12 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmov r2, r12, d4 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vmov r4, r3, d0 +; CHECK-NEXT: vpsel q1, q1, q3 ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: cset r6, lo -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r6, r3, r2, ne -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: cset r5, eq -; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: csel r2, r3, r2, lo -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r12, r2, r6, ne -; CHECK-NEXT: vmov r3, r6, d4 -; CHECK-NEXT: vmov r5, r4, d0 -; CHECK-NEXT: cmp r4, r6 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: vmov r4, r3, d1 ; CHECK-NEXT: cset r2, lo ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel 
lr, r5, r3, ne -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: cset r2, eq -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: csel r3, r5, r3, lo +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r1, r2, #0, #8 +; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: cset r2, lo ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csel r2, r3, lr, ne -; CHECK-NEXT: cmp r7, r1 -; CHECK-NEXT: csel r1, r7, r1, lo -; CHECK-NEXT: cmp r4, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: csel r2, r4, r6, lo -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r1, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, pc} %c = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %a, <4 x i64> %b) store <4 x i64> %c, <4 x i64>* %p ret void diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll @@ -575,17 +575,21 @@ ; CHECK-LABEL: cmpeqz_v2i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q2, q0, q1 -; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d4 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d5 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer @@ -598,32 +602,41 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeq_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r12, r2, d4 -; CHECK-NEXT: vmov r3, r1, d2 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: eor.w r2, r3, r12 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov r12, r2, d5 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r3, r0, d3 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eor.w r2, r3, r12 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: vmov r2, r3, d1 +; 
CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: ands r0, r2 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer @@ -636,30 +649,39 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqr_v2i1(<2 x i64> %a, <2 x i64> %b, i64 %c) { ; CHECK-LABEL: cmpeqr_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: eors r3, r1 ; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r12, r3, d2 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: eors r1, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: cset r12, ne +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: and.w r2, r2, r12 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r12, r2, d3 +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -163,14 +163,18 @@ ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: and r1, r0, #2 -; CHECK-LE-NEXT: and r0, r0, #1 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: rsbs r0, r0, #0 -; CHECK-LE-NEXT: sub.w r1, r2, r1, lsr #1 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r0, r1 -; CHECK-LE-NEXT: vmov q1[3], q1[1], r0, r1 -; CHECK-LE-NEXT: vand q0, q0, q1 +; CHECK-LE-NEXT: and r0, r0, #3 +; CHECK-LE-NEXT: vmov.i8 q1, #0x0 +; CHECK-LE-NEXT: vmov.i8 q2, #0xff +; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vpsel q1, q2, q1 +; CHECK-LE-NEXT: vmov.u8 r0, q1[1] +; CHECK-LE-NEXT: vmov.u8 r1, q1[0] +; CHECK-LE-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-LE-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; @@ -178,15 +182,19 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: and r1, r0, #2 -; CHECK-BE-NEXT: and r0, r0, #1 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: rsbs r0, r0, #0 -; CHECK-BE-NEXT: sub.w r1, r2, r1, lsr #1 +; CHECK-BE-NEXT: rbit 
r0, r0 +; CHECK-BE-NEXT: vmov.i8 q1, #0x0 +; CHECK-BE-NEXT: vmov.i8 q2, #0xff +; CHECK-BE-NEXT: lsrs r0, r0, #30 +; CHECK-BE-NEXT: vmsr p0, r0 +; CHECK-BE-NEXT: vpsel q1, q2, q1 +; CHECK-BE-NEXT: vmov.u8 r0, q1[1] +; CHECK-BE-NEXT: vmov.u8 r1, q1[0] ; CHECK-BE-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-BE-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: vand q0, q0, q2 +; CHECK-BE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: vpsel q0, q0, q1 ; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: @@ -359,15 +367,17 @@ ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: orrs r0, r1 -; CHECK-LE-NEXT: vmov r1, r2, d1 ; CHECK-LE-NEXT: cset r0, eq +; CHECK-LE-NEXT: cmp r0, #0 +; CHECK-LE-NEXT: mov.w r0, #0 +; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: bfi r0, r1, #0, #1 +; CHECK-LE-NEXT: vmov r1, r2, d1 ; CHECK-LE-NEXT: orrs r1, r2 ; CHECK-LE-NEXT: cset r1, eq ; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r1, #1 -; CHECK-LE-NEXT: bfi r1, r0, #0, #1 -; CHECK-LE-NEXT: and r0, r1, #3 +; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: bfi r0, r1, #1, #1 ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; @@ -378,15 +388,17 @@ ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vmov r0, r1, d3 ; CHECK-BE-NEXT: orrs r0, r1 -; CHECK-BE-NEXT: vmov r1, r2, d2 ; CHECK-BE-NEXT: cset r0, eq +; CHECK-BE-NEXT: cmp r0, #0 +; CHECK-BE-NEXT: mov.w r0, #0 +; CHECK-BE-NEXT: csetm r1, ne +; CHECK-BE-NEXT: bfi r0, r1, #0, #1 +; CHECK-BE-NEXT: vmov r1, r2, d2 ; CHECK-BE-NEXT: orrs r1, r2 ; CHECK-BE-NEXT: cset r1, eq ; CHECK-BE-NEXT: cmp r1, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r1, #1 -; CHECK-BE-NEXT: bfi r1, r0, #0, #1 -; CHECK-BE-NEXT: and r0, r1, #3 +; CHECK-BE-NEXT: csetm r1, ne +; CHECK-BE-NEXT: bfi r0, r1, #1, #1 ; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-build-const.ll b/llvm/test/CodeGen/Thumb2/mve-pred-build-const.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-build-const.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-build-const.ll @@ -156,26 +156,10 @@ define arm_aapcs_vfpcc <2 x i64> @build_upper_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: build_upper_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adr r0, .LCPI14_0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: adr r0, .LCPI14_1 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI14_0: -; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .LCPI14_1: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: mov.w r0, #65280 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr entry: %s = select <2 x i1> , <2 x i64> %a, <2 x i64> %b ret <2 x i64> %s @@ -184,26 +168,10 @@ define arm_aapcs_vfpcc <2 x i64> @build_lower_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: build_lower_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adr r0, .LCPI15_0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: adr r0, .LCPI15_1 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: bx lr -; 
CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI15_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .LCPI15_1: -; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: movs r0, #255 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr entry: %s = select <2 x i1> , <2 x i64> %a, <2 x i64> %b ret <2 x i64> %s diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll @@ -159,19 +159,12 @@ ; CHECK-LABEL: build_var0_v2i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: vldr s10, .LCPI9_0 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: vmov s8, r0 -; CHECK-NEXT: vmov.f32 s11, s10 -; CHECK-NEXT: vmov.f32 s9, s8 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %c = icmp ult i32 %s, %t %vc = insertelement <2 x i1> zeroinitializer, i1 %c, i64 0 @@ -183,19 +176,12 @@ ; CHECK-LABEL: build_var1_v2i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: vldr s8, .LCPI10_0 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: vmov s10, r0 -; CHECK-NEXT: vmov.f32 s9, s8 -; CHECK-NEXT: vmov.f32 s11, s10 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI10_0: -; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %c = icmp ult i32 %s, %t %vc = insertelement <2 x i1> zeroinitializer, i1 %c, i64 1 @@ -208,10 +194,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = icmp ult i32 %s, %t diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-const.ll b/llvm/test/CodeGen/Thumb2/mve-pred-const.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-const.ll @@ -1,6 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s +define arm_aapcs_vfpcc i32 @build_v2i_v2i1_1() { +; CHECK-LABEL: build_v2i_v2i1_1: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #65535 +; CHECK-NEXT: bx lr + %r = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> ) + ret i32 %r +} +define arm_aapcs_vfpcc i32 @build_v2i_v2i1_0() { +; CHECK-LABEL: build_v2i_v2i1_0: +; CHECK: @ %bb.0: +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: bx lr + %r = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> ) + ret i32 %r +} +define arm_aapcs_vfpcc i32 @build_v2i_v2i1_5() { +; CHECK-LABEL: build_v2i_v2i1_5: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov.w r0, #65280 +; CHECK-NEXT: bx lr + %r = call i32 
@llvm.arm.mve.pred.v2i.v2i1(<2 x i1> ) + ret i32 %r +} + define arm_aapcs_vfpcc i32 @build_v2i_v4i1_1() { ; CHECK-LABEL: build_v2i_v4i1_1: ; CHECK: @ %bb.0: @@ -78,6 +103,46 @@ +define arm_aapcs_vfpcc <2 x i64> @build_i2v_v2i1_1() { +; CHECK-LABEL: build_i2v_v2i1_1: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #65535 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr + %c = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 65535) + %r = select <2 x i1> %c, <2 x i64> , <2 x i64> + ret <2 x i64> %r +} +define arm_aapcs_vfpcc <2 x i64> @build_i2v_v2i1_0() { +; CHECK-LABEL: build_i2v_v2i1_0: +; CHECK: @ %bb.0: +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr + %c = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 0) + %r = select <2 x i1> %c, <2 x i64> , <2 x i64> + ret <2 x i64> %r +} +define arm_aapcs_vfpcc <2 x i64> @build_i2v_v2i1_5() { +; CHECK-LABEL: build_i2v_v2i1_5: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #61680 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr + %c = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 61680) + %r = select <2 x i1> %c, <2 x i64> , <2 x i64> + ret <2 x i64> %r +} + define arm_aapcs_vfpcc <4 x i32> @build_i2v_v4i1_1() { ; CHECK-LABEL: build_i2v_v4i1_1: ; CHECK: @ %bb.0: @@ -199,6 +264,15 @@ } +define arm_aapcs_vfpcc i32 @build_i2v2i_v2i1_5() { +; CHECK-LABEL: build_i2v2i_v2i1_5: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #61680 +; CHECK-NEXT: bx lr + %c = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 61680) + %r = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> %c) + ret i32 %r +} define arm_aapcs_vfpcc i32 @build_i2v2i_v4i1_5() { ; CHECK-LABEL: build_i2v2i_v4i1_5: ; CHECK: @ %bb.0: @@ -228,6 +302,49 @@ } +define arm_aapcs_vfpcc <2 x i64> @build_v2i2v_v4i1_v2i1_5() { +; CHECK-LABEL: build_v2i2v_v4i1_v2i1_5: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #61680 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr + %b = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> ) + %c = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %b) + %r = select <2 x i1> %c, <2 x i64> , <2 x i64> zeroinitializer + ret <2 x i64> %r +} +define arm_aapcs_vfpcc <2 x i64> @build_v2i2v_v8i1_v2i1_5() { +; CHECK-LABEL: build_v2i2v_v8i1_v2i1_5: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #52428 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr + %b = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> ) + %c = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %b) + %r = select <2 x i1> %c, <2 x i64> , <2 x i64> zeroinitializer + ret <2 x i64> %r +} +define arm_aapcs_vfpcc <2 x i64> @build_v2i2v_v16i1_v2i1_5() { +; CHECK-LABEL: build_v2i2v_v16i1_v2i1_5: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #43690 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr + %b = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> ) + %c = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %b) + %r = select <2 x i1> %c, <2 x i64> , <2 x i64> zeroinitializer + ret <2 x i64> %r +} + define arm_aapcs_vfpcc <4 x i32> @build_v2i2v_v4i1_v4i1_5() { ; CHECK-LABEL: 
build_v2i2v_v4i1_v4i1_5: ; CHECK: @ %bb.0: @@ -357,10 +474,12 @@ ret <16 x i8> %r } +declare i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>) +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll @@ -105,24 +105,24 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI6_0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpeq -; CHECK-NEXT: vmov r2, r1, d8 -; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: lsrs r0, r0, #5 -; CHECK-NEXT: csetm r6, ne -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpeq ; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: clz r1, r6 ; CHECK-NEXT: lsrs r0, r0, #5 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r6 +; CHECK-NEXT: lsrs r1, r1, #5 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 3 @@ -210,33 +210,28 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) { ; CHECK-LABEL: zext_v2i1_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r2, lr, d0 -; CHECK-NEXT: adr r3, .LCPI12_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vldr s1, .LCPI12_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r12, r1 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: sbcs.w r1, r12, lr +; CHECK-NEXT: sbcs.w r1, r12, r3 +; CHECK-NEXT: vmov s2, r0 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI12_0: -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %c = icmp sgt <2 x i64> %src, zeroinitializer %0 = zext <2 x i1> %c to <2 x i64> @@ -259,31 +254,29 @@ ; CHECK-NEXT: bl __aeabi_dcmpeq ; CHECK-NEXT: vmov r2, r1, d8 ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: adr r3, .LCPI13_1 -; CHECK-NEXT: lsrs r0, r0, #5 -; CHECK-NEXT: vldrw.u32 q4, [r3] ; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: csetm r6, ne +; CHECK-NEXT: vldr s17, .LCPI13_1 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: cset r6, ne ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: bl __aeabi_dcmpeq ; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: vmov s18, r6 +; CHECK-NEXT: 
vmov.f32 s19, s17 ; CHECK-NEXT: lsrs r0, r0, #5 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 -; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: vmov s16, r0 +; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} -; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI13_1: -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .LCPI13_0: ; CHECK-NEXT: .long 0 @ double 0 ; CHECK-NEXT: .long 0 +; CHECK-NEXT: .LCPI13_1: +; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %c = fcmp une <2 x double> %src, zeroinitializer %0 = zext <2 x i1> %c to <2 x i64> @@ -339,15 +332,18 @@ define arm_aapcs_vfpcc <2 x i64> @trunc_v2i1_v2i64(<2 x i64> %src) { ; CHECK-LABEL: trunc_v2i1_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: and r1, r1, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: and r1, r1, #1 -; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = trunc <2 x i64> %src to <2 x i1> @@ -472,45 +468,31 @@ define arm_aapcs_vfpcc <2 x double> @uitofp_v2i1_v2f64(<2 x i64> %src) { ; CHECK-LABEL: uitofp_v2i1_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r2, lr, d0 -; CHECK-NEXT: adr r3, .LCPI26_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r12, r1 +; CHECK-NEXT: cset r4, lt +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: sbcs.w r0, r12, r3 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: sbcs.w r1, r12, lr -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vand q4, q1, q0 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: bl __aeabi_ul2d -; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: bl __aeabi_ui2d +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: vmov d9, r0, r1 +; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: bl __aeabi_ul2d +; CHECK-NEXT: bl __aeabi_ui2d ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI26_0: -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: pop {r4, pc} entry: %c = icmp sgt <2 x i64> %src, zeroinitializer %0 = uitofp <2 x i1> %c to <2 x double> @@ -535,14 +517,12 @@ ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: bl __aeabi_l2d +; CHECK-NEXT: bl __aeabi_i2d ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: vmov d9, r0, r1 ; 
CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: bl __aeabi_l2d +; CHECK-NEXT: bl __aeabi_i2d ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9} @@ -556,30 +536,28 @@ define arm_aapcs_vfpcc <2 x double> @fptoui_v2i1_v2f64(<2 x double> %src) { ; CHECK-LABEL: fptoui_v2i1_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 -; CHECK-NEXT: adr r2, .LCPI28_0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r5 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: bl __aeabi_d2iz +; CHECK-NEXT: vmov r2, r1, d9 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: adr r3, .LCPI28_0 +; CHECK-NEXT: bfi r4, r0, #0, #8 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: bl __aeabi_d2iz +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r4, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vpsel q0, q5, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI28_0: @@ -596,24 +574,26 @@ define arm_aapcs_vfpcc <2 x double> @fptosi_v2i1_v2f64(<2 x double> %src) { ; CHECK-LABEL: fptosi_v2i1_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: adr r2, .LCPI29_0 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r5 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: bl __aeabi_d2iz +; CHECK-NEXT: vmov r2, r1, d9 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: adr r3, .LCPI29_0 +; CHECK-NEXT: bfi r4, r0, #0, #8 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: bl __aeabi_d2iz +; CHECK-NEXT: bfi r4, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vpsel q0, q5, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI29_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll @@ -146,26 +146,35 @@ ; CHECK-LE-LABEL: load_v2i1: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: 
ldrb r0, [r0] -; CHECK-LE-NEXT: ubfx r1, r0, #1, #1 -; CHECK-LE-NEXT: and r0, r0, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: rsbs r0, r0, #0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r0, r1 -; CHECK-LE-NEXT: vmov q1[3], q1[1], r0, r1 -; CHECK-LE-NEXT: vand q0, q0, q1 +; CHECK-LE-NEXT: vmov.i8 q1, #0x0 +; CHECK-LE-NEXT: vmov.i8 q2, #0xff +; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vpsel q1, q2, q1 +; CHECK-LE-NEXT: vmov.u8 r0, q1[1] +; CHECK-LE-NEXT: vmov.u8 r1, q1[0] +; CHECK-LE-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-LE-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_v2i1: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: ldrb r0, [r0] -; CHECK-BE-NEXT: and r1, r0, #1 -; CHECK-BE-NEXT: ubfx r0, r0, #1, #1 -; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: rsbs r0, r0, #0 -; CHECK-BE-NEXT: vmov q1[2], q1[0], r0, r1 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r0, r1 -; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: vand q0, q0, q2 +; CHECK-BE-NEXT: vmov.i8 q1, #0x0 +; CHECK-BE-NEXT: vmov.i8 q2, #0xff +; CHECK-BE-NEXT: rbit r0, r0 +; CHECK-BE-NEXT: lsrs r0, r0, #30 +; CHECK-BE-NEXT: vmsr p0, r0 +; CHECK-BE-NEXT: vpsel q1, q2, q1 +; CHECK-BE-NEXT: vmov.u8 r0, q1[1] +; CHECK-BE-NEXT: vmov.u8 r1, q1[0] +; CHECK-BE-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-BE-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-BE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: vpsel q0, q0, q1 ; CHECK-BE-NEXT: bx lr entry: %c = load <2 x i1>, <2 x i1>* %src @@ -320,16 +329,18 @@ ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vmov r1, r2, d0 ; CHECK-LE-NEXT: orrs r1, r2 -; CHECK-LE-NEXT: vmov r2, r3, d1 +; CHECK-LE-NEXT: mov.w r2, #0 ; CHECK-LE-NEXT: cset r1, eq -; CHECK-LE-NEXT: orrs r2, r3 -; CHECK-LE-NEXT: cset r2, eq -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne ; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: strb r1, [r0] +; CHECK-LE-NEXT: vmov r1, r3, d1 +; CHECK-LE-NEXT: orrs r1, r3 +; CHECK-LE-NEXT: cset r1, eq +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: bfi r2, r1, #1, #1 +; CHECK-LE-NEXT: strb r2, [r0] ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: store_v2i1: @@ -337,16 +348,18 @@ ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vmov r1, r2, d3 ; CHECK-BE-NEXT: orrs r1, r2 -; CHECK-BE-NEXT: vmov r2, r3, d2 +; CHECK-BE-NEXT: mov.w r2, #0 ; CHECK-BE-NEXT: cset r1, eq -; CHECK-BE-NEXT: orrs r2, r3 -; CHECK-BE-NEXT: cset r2, eq -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 +; CHECK-BE-NEXT: cmp r1, #0 +; CHECK-BE-NEXT: csetm r1, ne ; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: strb r1, [r0] +; CHECK-BE-NEXT: vmov r1, r3, d2 +; CHECK-BE-NEXT: orrs r1, r3 +; CHECK-BE-NEXT: cset r1, eq +; CHECK-BE-NEXT: cmp r1, #0 +; CHECK-BE-NEXT: csetm r1, ne +; CHECK-BE-NEXT: bfi r2, r1, #1, #1 +; CHECK-BE-NEXT: strb r2, [r0] ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll @@ -323,17 +323,21 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: cmpeqz_v2i1: ; 
CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q0, q0, q2 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer @@ -345,17 +349,21 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeq_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q0, q0, q2 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll @@ -377,26 +377,35 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: cmpeqz_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vorr q2, q3, q2 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r1, #1 +; CHECK-NEXT: rsbs r0, r1, #0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r2, #1 +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr 
entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer @@ -409,32 +418,41 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeq_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r12, r2, d4 -; CHECK-NEXT: vmov r3, r1, d2 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: eor.w r2, r3, r12 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vorr q2, q3, q2 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov r12, r2, d5 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r3, r0, d3 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eor.w r2, r3, r12 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -4,23 +4,23 @@ define <2 x i64> @shuffle1_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: shuffle1_v2i64: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs.w r1, r2, r3 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: bx lr @@ -109,19 +109,23 @@ define <2 x i64> @shuffle2_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: shuffle2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: csetm r2, eq ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: vmov q0[2], 
q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: orrs.w r0, r2, r3 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: bx lr @@ -202,14 +206,12 @@ ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: bx lr @@ -308,14 +310,12 @@ ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: bx lr @@ -416,22 +416,20 @@ ; CHECK-LABEL: shuffle5_b_v2i64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: and r0, r0, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 ; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: bx lr @@ -446,22 +444,20 @@ ; CHECK-LABEL: shuffle5_t_v2i64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: ubfx r1, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 ; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp -; 
CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: bx lr @@ -619,32 +615,30 @@ define <4 x i32> @shuffle6_v2i64(<2 x i64> %src1, <2 x i64> %src2, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: shuffle6_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: csetm r2, eq ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: and r0, r0, #1 -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #4 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: orrs.w r0, r2, r3 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #4, #4 ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: and r0, r0, #1 -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #8, #4 ; CHECK-NEXT: vmov r0, r2, d1 ; CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: and r0, r0, #1 -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #12, #4 ; CHECK-NEXT: add r0, sp, #32 ; CHECK-NEXT: vldrw.u32 q0, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-spill.ll b/llvm/test/CodeGen/Thumb2/mve-pred-spill.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-spill.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-spill.ll @@ -12,48 +12,67 @@ ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r7, lr} ; CHECK-LE-NEXT: push {r7, lr} -; CHECK-LE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LE-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LE-NEXT: vmov r0, r1, d1 +; CHECK-LE-NEXT: .vsave {d8, d9} +; CHECK-LE-NEXT: vpush {d8, d9} +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vmov r0, r1, d0 +; CHECK-LE-NEXT: vmov q4, q1 ; CHECK-LE-NEXT: orrs r0, r1 -; CHECK-LE-NEXT: vmov r1, r2, d0 -; CHECK-LE-NEXT: csetm r0, eq -; CHECK-LE-NEXT: orrs r1, r2 -; CHECK-LE-NEXT: csetm r1, eq -; CHECK-LE-NEXT: vmov q5[2], q5[0], r1, r0 -; CHECK-LE-NEXT: vmov q5[3], q5[1], r1, r0 -; CHECK-LE-NEXT: vand q4, q1, q5 -; CHECK-LE-NEXT: vmov q0, q4 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: cset r0, eq +; CHECK-LE-NEXT: cmp r0, #0 +; CHECK-LE-NEXT: csetm r0, ne +; CHECK-LE-NEXT: bfi r1, r0, #0, #8 +; CHECK-LE-NEXT: vmov r0, r2, d1 +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: orrs r0, r2 +; CHECK-LE-NEXT: cset r0, eq +; CHECK-LE-NEXT: cmp r0, #0 +; CHECK-LE-NEXT: csetm r0, ne +; CHECK-LE-NEXT: bfi r1, r0, #8, #8 +; CHECK-LE-NEXT: vmsr p0, r1 +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill ; CHECK-LE-NEXT: bl ext_i64 -; CHECK-LE-NEXT: vbic q0, q0, q5 -; CHECK-LE-NEXT: vorr q0, q4, q0 -; CHECK-LE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-LE-NEXT: vpsel q0, q4, q0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: vpop {d8, d9} ; CHECK-LE-NEXT: pop {r7, pc} ; ; CHECK-BE-LABEL: shuffle1_v2i64: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r7, lr} ; 
CHECK-BE-NEXT: push {r7, lr} -; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-BE-NEXT: vpush {d8, d9, d10, d11} -; CHECK-BE-NEXT: vrev64.32 q2, q0 -; CHECK-BE-NEXT: vmov r0, r1, d5 +; CHECK-BE-NEXT: .vsave {d8, d9} +; CHECK-BE-NEXT: vpush {d8, d9} +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vmov q4, q1 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmov r0, r1, d2 +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 ; CHECK-BE-NEXT: orrs r0, r1 -; CHECK-BE-NEXT: vmov r1, r2, d4 -; CHECK-BE-NEXT: csetm r0, eq -; CHECK-BE-NEXT: orrs r1, r2 -; CHECK-BE-NEXT: csetm r1, eq -; CHECK-BE-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-BE-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-BE-NEXT: vrev64.32 q2, q0 -; CHECK-BE-NEXT: vmov.i8 q0, #0xff -; CHECK-BE-NEXT: vand q4, q1, q2 -; CHECK-BE-NEXT: veor q5, q2, q0 -; CHECK-BE-NEXT: vmov q0, q4 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: cset r0, eq +; CHECK-BE-NEXT: cmp r0, #0 +; CHECK-BE-NEXT: csetm r0, ne +; CHECK-BE-NEXT: bfi r1, r0, #0, #8 +; CHECK-BE-NEXT: vmov r0, r2, d3 +; CHECK-BE-NEXT: orrs r0, r2 +; CHECK-BE-NEXT: cset r0, eq +; CHECK-BE-NEXT: cmp r0, #0 +; CHECK-BE-NEXT: csetm r0, ne +; CHECK-BE-NEXT: bfi r1, r0, #8, #8 +; CHECK-BE-NEXT: vmsr p0, r1 +; CHECK-BE-NEXT: vpsel q0, q4, q0 +; CHECK-BE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill ; CHECK-BE-NEXT: bl ext_i64 -; CHECK-BE-NEXT: vand q0, q0, q5 -; CHECK-BE-NEXT: vorr q0, q4, q0 -; CHECK-BE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-BE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-BE-NEXT: vpsel q0, q4, q0 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: vpop {d8, d9} ; CHECK-BE-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i64> %src, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll @@ -70,39 +70,43 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeqz_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d4 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vbic q3, q3, q2 +; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vand q2, q4, q2 -; CHECK-NEXT: vorr q2, q2, q3 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csel r0, r1, r2, ne +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d5 
+; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: cset r12, eq +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: vmov r3, r0, d3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: orrs r0, r3 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: csel r0, r2, r0, ne +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer @@ -182,39 +186,33 @@ define arm_aapcs_vfpcc <2 x i64> @cmpnez_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpnez_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d4 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vbic q3, q3, q2 +; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vand q2, q4, q2 -; CHECK-NEXT: vorr q2, q2, q3 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csel r0, r1, r2, ne +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d5 +; CHECK-NEXT: orr.w r12, r0, r2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: vmov r3, r0, d3 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: orrs r0, r3 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: csel r0, r2, r0, ne +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp ne <2 x i64> %a, zeroinitializer @@ -296,33 +294,23 @@ define arm_aapcs_vfpcc <2 x i64> @cmpsltz_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpsltz_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vbic q3, q3, q2 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: cmp.w r3, r0, lsr #31 +; CHECK-NEXT: csel r0, r1, r2, ne +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: asr.w r12, r0, #31 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: 
vmov r0, s7 +; CHECK-NEXT: cmp.w r3, r1, lsr #31 +; CHECK-NEXT: bfi r3, r12, #0, #8 +; CHECK-NEXT: csel r0, r2, r0, ne ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vand q2, q4, q2 -; CHECK-NEXT: vorr q2, q2, q3 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp slt <2 x i64> %a, zeroinitializer @@ -405,39 +393,39 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: csetm lr, eq -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: csetm r4, eq -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: csetm r12, eq -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: csetm r2, eq +; CHECK-NEXT: orr.w r2, r0, r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: beq .LBB15_2 +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r12, ne +; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: csetm lr, ne +; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov r1, r4, d1 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: orrs r1, r4 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: cbz r2, .LBB15_2 ; CHECK-NEXT: @ %bb.1: @ %select.false -; CHECK-NEXT: vmov q2[2], q2[0], r4, lr -; CHECK-NEXT: vmov q2[3], q2[1], r4, lr +; CHECK-NEXT: bfi r0, r12, #0, #8 +; CHECK-NEXT: bfi r0, lr, #8, #8 ; CHECK-NEXT: b .LBB15_3 ; CHECK-NEXT: .LBB15_2: -; CHECK-NEXT: vmov q2[2], q2[0], r2, r12 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r12 +; CHECK-NEXT: bfi r0, r3, #0, #8 +; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: .LBB15_3: @ %select.end -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: and r0, r0, #1 -; CHECK-NEXT: and r1, r1, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: pop {r4, pc} entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll @@ -457,26 +457,35 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: cmpeqz_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], 
q3[1], r1, r0 -; CHECK-NEXT: veor q2, q3, q2 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: eorne r1, r1, #1 +; CHECK-NEXT: rsbs r0, r1, #0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: eorne r2, r2, #1 +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer @@ -489,32 +498,41 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeq_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r12, r2, d4 -; CHECK-NEXT: vmov r3, r1, d2 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: eor.w r2, r3, r12 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: veor q2, q3, q2 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov r12, r2, d5 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: eorne r0, r0, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r3, r0, d3 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eor.w r2, r3, r12 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: eorne r0, r0, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -6,10 +6,6 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r3, #0 @@ -27,60 +23,60 @@ ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bic r3, r3, #1 ; CHECK-NEXT: subs r7, r3, #2 -; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: adr r4, .LCPI0_0 -; 
CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: adr r4, .LCPI0_1 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w r11, r2, r3, lsl #2 ; CHECK-NEXT: add.w r10, r1, r3, lsl #2 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vmvn.i32 q1, #0x80000000 +; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r5, r4, [r0], #8 -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: ldrd r8, r7, [r1], #8 +; CHECK-NEXT: ldrd r4, r5, [r0], #8 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: ldrd r7, r8, [r1], #8 +; CHECK-NEXT: smull r8, r5, r8, r5 ; CHECK-NEXT: smull r4, r7, r7, r4 +; CHECK-NEXT: asrl r8, r5, #31 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: smull r6, r5, r8, r5 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 -; CHECK-NEXT: sbcs r3, r7 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r8 +; CHECK-NEXT: mov.w r9, #-1 +; CHECK-NEXT: sbcs.w r3, r9, r7 ; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: asrl r6, r5, #31 +; CHECK-NEXT: vmov q2[3], q2[1], r7, r5 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r4 -; CHECK-NEXT: csetm r9, ne -; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r7 -; CHECK-NEXT: sbcs r3, r5 -; CHECK-NEXT: mvn r6, #-2147483648 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r6, r3, #0, #8 +; CHECK-NEXT: rsbs.w r3, r8, #-2147483648 +; CHECK-NEXT: sbcs.w r3, r9, r5 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q2[2], q2[0], r3, r9 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r9 -; CHECK-NEXT: vbic q3, q0, q2 -; CHECK-NEXT: vand q2, q4, q2 -; CHECK-NEXT: vorr q2, q2, q3 +; CHECK-NEXT: bfi r6, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: mvn r6, #-2147483648 +; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: vmov r3, r4, d4 ; CHECK-NEXT: subs r3, r3, r6 ; CHECK-NEXT: sbcs r3, r4, #0 -; CHECK-NEXT: vmov r4, r5, d5 +; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r4, r3, #0, #8 +; CHECK-NEXT: vmov r3, r5, d5 +; CHECK-NEXT: subs r3, r3, r6 +; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q3[1], r3 -; CHECK-NEXT: subs r4, r4, r6 -; CHECK-NEXT: sbcs r4, r5, #0 -; CHECK-NEXT: cset r4, lt -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov q3[2], q3[0], r3, r4 -; CHECK-NEXT: vbic q4, q1, q3 -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vorr q2, q2, q4 +; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vpsel q2, q2, q1 ; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: strd r4, r3, [r2], #8 @@ -113,8 +109,6 @@ ; CHECK-NEXT: le lr, .LBB0_7 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.9: @@ -123,6 +117,11 @@ ; CHECK-NEXT: .long 4294967295 @ 0xffffffff ; CHECK-NEXT: .long 2147483648 @ 0x80000000 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 entry: switch i32 %N, label %vector.ph [ i32 0, label 
%for.cond.cleanup @@ -213,154 +212,154 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_3: @ %vector.ph -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: subs r7, r3, #4 ; CHECK-NEXT: adr r4, .LCPI1_0 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: adr r4, .LCPI1_1 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r7, r1, r3, lsl #2 +; CHECK-NEXT: strd r7, r3, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: adr r4, .LCPI1_1 ; CHECK-NEXT: add.w r11, r2, r3, lsl #2 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r9, #-1 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov.w r2, #-1 -; CHECK-NEXT: vmov.f32 s16, s10 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vmov.f32 s20, s14 -; CHECK-NEXT: vmov.f32 s22, s15 -; CHECK-NEXT: vmullb.s32 q6, q5, q4 -; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: vmov r4, r7, d13 +; CHECK-NEXT: vldrw.u32 q3, [r0], #16 +; CHECK-NEXT: vldrw.u32 q4, [r1], #16 +; CHECK-NEXT: mov.w r3, #-1 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: mov.w r6, #-1 +; CHECK-NEXT: vmov.f32 s20, s18 +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vmov.f32 s22, s19 +; CHECK-NEXT: vmullb.s32 q6, q5, q2 +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov r4, r7, d12 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 -; CHECK-NEXT: sbcs.w r5, r2, r7 +; CHECK-NEXT: sbcs.w r5, r3, r7 ; CHECK-NEXT: cset r5, lt ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: vmov r10, r5, d12 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: bfi r8, r5, #0, #8 +; CHECK-NEXT: vmov r10, r5, d13 ; CHECK-NEXT: asrl r10, r5, #31 -; CHECK-NEXT: csetm r8, ne ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 -; CHECK-NEXT: vmov q6[2], q6[0], r10, r4 -; CHECK-NEXT: sbcs.w r3, r2, r5 -; CHECK-NEXT: vmov q6[3], q6[1], r5, r7 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r10 +; CHECK-NEXT: sbcs.w r3, r6, r5 +; CHECK-NEXT: vmov q2[3], q2[1], r7, r5 ; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: mvn r10, #-2147483648 ; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q4[2], q4[0], r3, r8 -; CHECK-NEXT: vmov q4[3], q4[1], r3, r8 -; CHECK-NEXT: mvn r8, #-2147483648 -; CHECK-NEXT: vbic q5, q0, q4 -; CHECK-NEXT: vand q4, q6, q4 -; CHECK-NEXT: vorr q4, q4, q5 -; CHECK-NEXT: vmov r3, r4, d8 -; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: bfi r8, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r8 +; CHECK-NEXT: vpsel q2, q2, q0 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: subs.w r3, r3, r10 ; CHECK-NEXT: sbcs r3, r4, #0 -; CHECK-NEXT: vmov r4, r5, d9 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q5[1], r3 -; CHECK-NEXT: subs.w r4, r4, r8 -; 
CHECK-NEXT: sbcs r4, r5, #0 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: cset r4, lt -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov q5[2], q5[0], r3, r4 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov.f32 s10, s13 -; CHECK-NEXT: vbic q6, q1, q5 -; CHECK-NEXT: vand q4, q4, q5 -; CHECK-NEXT: vorr q4, q4, q6 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: smull r6, r5, r6, r5 -; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: smull r4, r7, r4, r3 -; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: vmov q5[2], q5[0], r6, r4 -; CHECK-NEXT: sbcs.w r3, r2, r7 -; CHECK-NEXT: vmov q5[3], q5[1], r5, r7 +; CHECK-NEXT: bfi r4, r3, #0, #8 +; CHECK-NEXT: vmov r3, r5, d5 +; CHECK-NEXT: subs.w r3, r3, r10 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: rsbs.w r1, r6, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r2, r5 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: ldrd r1, r2, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: vbic q3, q0, q2 -; CHECK-NEXT: vand q2, q5, q2 -; CHECK-NEXT: vorr q2, q2, q3 -; CHECK-NEXT: vmov r4, r3, d4 -; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vpsel q2, q2, q1 +; CHECK-NEXT: smull r8, r7, r4, r3 +; CHECK-NEXT: asrl r8, r7, #31 +; CHECK-NEXT: rsbs.w r3, r8, #-2147483648 +; CHECK-NEXT: sbcs.w r3, r9, r7 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov r3, r4, d5 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov.32 q3[1], r5 -; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r6, r3, #0, #8 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: smull r4, r5, r5, r3 +; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 +; CHECK-NEXT: vmov q3[2], q3[0], r8, r4 +; CHECK-NEXT: sbcs.w r3, r9, r5 +; CHECK-NEXT: vmov q3[3], q3[1], r7, r5 +; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r6, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: vpsel q3, q3, q0 +; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: subs.w r3, r3, r10 ; CHECK-NEXT: sbcs r3, r4, #0 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 -; CHECK-NEXT: vbic q5, q1, q3 -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vorr q2, q2, q5 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s10, s16 -; CHECK-NEXT: vmov.f32 s11, s18 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: bfi r4, r3, #0, #8 +; CHECK-NEXT: vmov r3, r5, d7 +; CHECK-NEXT: subs.w r3, r3, r10 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vpsel q3, q3, q1 +; CHECK-NEXT: vmov.f32 s13, s14 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r7, r3, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ 
%for.body.preheader21 ; CHECK-NEXT: sub.w lr, r3, r7 -; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: mov.w r1, #-1 ; CHECK-NEXT: mov.w r3, #-2147483648 ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r12], #4 -; CHECK-NEXT: ldr r4, [r9], #4 -; CHECK-NEXT: smull r4, r1, r4, r1 -; CHECK-NEXT: asrl r4, r1, #31 -; CHECK-NEXT: subs r5, r3, r4 -; CHECK-NEXT: sbcs.w r5, r0, r1 -; CHECK-NEXT: cset r5, lt -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: ldr r5, [r0], #4 +; CHECK-NEXT: smull r4, r5, r5, r4 +; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: subs r6, r3, r4 +; CHECK-NEXT: sbcs.w r6, r1, r5 +; CHECK-NEXT: cset r6, lt +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: csel r4, r4, r3, ne -; CHECK-NEXT: csel r1, r1, r0, ne -; CHECK-NEXT: subs r5, r4, r2 -; CHECK-NEXT: sbcs r1, r1, #0 -; CHECK-NEXT: csel r1, r4, r2, lt -; CHECK-NEXT: str r1, [r11], #4 +; CHECK-NEXT: csel r5, r5, r1, ne +; CHECK-NEXT: subs r6, r4, r2 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: csel r4, r4, r2, lt +; CHECK-NEXT: str r4, [r11], #4 ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #16 @@ -464,8 +463,8 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -490,106 +489,109 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vdup.32 q4, r9 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: add.w r9, r9, #4 ; CHECK-NEXT: vorr q4, q4, q0 -; CHECK-NEXT: vptt.u32 cs, q1, q4 -; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; CHECK-NEXT: vldrwt.u32 q5, [r1], #16 -; CHECK-NEXT: vmov.f32 s24, s18 -; CHECK-NEXT: vmov.f32 s26, s19 -; CHECK-NEXT: vmov.f32 s28, s22 -; CHECK-NEXT: vmov.f32 s30, s23 -; CHECK-NEXT: vmullb.s32 q0, q7, q6 -; CHECK-NEXT: vmov r6, r5, d1 -; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 +; CHECK-NEXT: vcmp.u32 cs, q1, q4 +; CHECK-NEXT: vstr p0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q5, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q6, [r1], #16 +; CHECK-NEXT: vmov.f32 s16, s22 +; CHECK-NEXT: vmov.f32 s18, s23 +; CHECK-NEXT: vmov.f32 s28, s26 +; CHECK-NEXT: vmov.f32 s30, s27 +; CHECK-NEXT: vmullb.s32 q0, q7, q4 +; CHECK-NEXT: vmov.f32 s22, s21 +; CHECK-NEXT: vmov r10, r5, d0 +; CHECK-NEXT: asrl r10, r5, #31 +; CHECK-NEXT: rsbs.w r7, r10, #-2147483648 ; CHECK-NEXT: sbcs.w r7, r12, r5 ; CHECK-NEXT: cset r7, lt ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: vmov r4, r7, d0 -; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: csetm r10, ne -; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: vmov q7[2], q7[0], r4, r6 +; CHECK-NEXT: csetm r7, ne +; CHECK-NEXT: bfi r4, r7, #0, #8 +; CHECK-NEXT: vmov r6, r7, d1 +; CHECK-NEXT: asrl r6, r7, #31 +; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 +; CHECK-NEXT: vmov q0[2], q0[0], r10, r6 ; CHECK-NEXT: sbcs.w r3, r12, r7 -; CHECK-NEXT: vmov q7[3], q7[1], r7, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r7 ; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: vmov r7, s20 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r10 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r10 -; CHECK-NEXT: vbic q6, q2, q0 -; 
CHECK-NEXT: vand q0, q7, q0 -; CHECK-NEXT: vorr q6, q0, q6 -; CHECK-NEXT: vmov r3, r4, d12 +; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vpsel q4, q0, q2 +; CHECK-NEXT: vmov.f32 s2, s25 +; CHECK-NEXT: vmov r3, r4, d8 +; CHECK-NEXT: vmov r7, s2 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r4, #0 -; CHECK-NEXT: vmov r4, r5, d13 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: sbcs r4, r5, #0 -; CHECK-NEXT: cset r4, lt -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r4 -; CHECK-NEXT: vbic q7, q3, q0 -; CHECK-NEXT: vand q0, q6, q0 -; CHECK-NEXT: vorr q6, q0, q7 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.f32 s2, s21 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: smull r6, r5, r4, r3 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 +; CHECK-NEXT: bfi r4, r3, #0, #8 +; CHECK-NEXT: vmov r3, r5, d9 +; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vpsel q4, q4, q3 +; CHECK-NEXT: smull r10, r5, r4, r3 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: asrl r10, r5, #31 +; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 ; CHECK-NEXT: sbcs.w r3, r12, r5 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r10, ne -; CHECK-NEXT: smull r4, r7, r7, r4 -; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r6 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r4, r3, #0, #8 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: smull r6, r7, r7, r3 +; CHECK-NEXT: asrl r6, r7, #31 +; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 +; CHECK-NEXT: vmov q0[2], q0[0], r10, r6 ; CHECK-NEXT: sbcs.w r3, r12, r7 -; CHECK-NEXT: vmov q5[3], q5[1], r7, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r7 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r10 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r10 -; CHECK-NEXT: vbic q4, q2, q0 -; CHECK-NEXT: vand q0, q5, q0 -; CHECK-NEXT: vorr q4, q0, q4 -; CHECK-NEXT: vmov r3, r4, d8 +; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vpsel q5, q0, q2 +; CHECK-NEXT: vmov r3, r4, d10 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r4, #0 -; CHECK-NEXT: vmov r4, r5, d9 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: cset r3, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: sbcs r4, r5, #0 -; CHECK-NEXT: cset r4, lt -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r4 -; CHECK-NEXT: vbic q5, q3, q0 -; CHECK-NEXT: vand q0, q4, q0 -; CHECK-NEXT: vorr q0, q0, q5 +; CHECK-NEXT: bfi r4, r3, #0, #8 +; CHECK-NEXT: vmov r3, r5, d11 +; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: cset r3, lt +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vpsel q0, q5, q3 +; CHECK-NEXT: vldr p0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s24 -; CHECK-NEXT: vmov.f32 s3, s26 +; CHECK-NEXT: 
vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s18 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; CHECK-NEXT: .p2align 4 @@ -658,8 +660,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: @ %bb.1: @ %entry @@ -668,47 +670,51 @@ ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r11, r1 -; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB3_6 ; CHECK-NEXT: .LBB3_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #1 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: bic r3, r3, #1 +; CHECK-NEXT: subs r7, r3, #2 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: subs r7, r5, #2 -; CHECK-NEXT: str r5, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r8, r2, r5, lsl #2 -; CHECK-NEXT: add.w r11, r1, r5, lsl #2 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r11, r2, r3, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 -; CHECK-NEXT: add.w r12, r0, r5, lsl #2 +; CHECK-NEXT: add.w r10, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r7, [r0], #8 -; CHECK-NEXT: ldrd r5, r10, [r1], #8 -; CHECK-NEXT: umull r4, r5, r5, r4 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: subs.w r6, r4, #-1 -; CHECK-NEXT: umull r6, r7, r10, r7 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: cset r5, lo -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: lsrl r6, r7, #31 -; CHECK-NEXT: csetm r9, ne -; CHECK-NEXT: subs.w r5, r6, #-1 -; CHECK-NEXT: vmov.32 q0[1], r9 -; CHECK-NEXT: sbcs r5, r7, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r6 +; CHECK-NEXT: ldrd r4, r6, [r0], #8 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: ldrd r7, r3, [r1], #8 +; CHECK-NEXT: umull r4, r9, r7, r4 +; CHECK-NEXT: lsrl r4, r9, #31 +; CHECK-NEXT: subs.w r5, r4, #-1 +; CHECK-NEXT: sbcs r5, r9, #0 ; CHECK-NEXT: cset r5, lo ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov q0[2], q0[0], r9, r5 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vorn q0, q1, q0 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: strd r5, r4, [r2], #8 +; CHECK-NEXT: bfi r8, r5, #0, #8 +; CHECK-NEXT: umull r6, r5, r3, r6 +; CHECK-NEXT: lsrl r6, r5, #31 +; CHECK-NEXT: subs.w r7, r6, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r6 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r9, r5 +; CHECK-NEXT: cset r3, lo +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r8, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r8 +; CHECK-NEXT: vpsel q1, q1, q0 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: strd r4, r3, [r2], #8 ; CHECK-NEXT: le lr, .LBB3_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: .LBB3_6: @ %for.body.preheader @@ -716,17 +722,17 @@ ; CHECK-NEXT: .LBB3_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: 
Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 -; CHECK-NEXT: ldr r1, [r11], #4 +; CHECK-NEXT: ldr r1, [r10], #4 ; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: lsrl r0, r1, #31 ; CHECK-NEXT: subs.w r2, r0, #-1 ; CHECK-NEXT: sbcs r1, r1, #0 ; CHECK-NEXT: it hs ; CHECK-NEXT: movhs.w r0, #-1 -; CHECK-NEXT: str r0, [r8], #4 +; CHECK-NEXT: str r0, [r11], #4 ; CHECK-NEXT: le lr, .LBB3_7 ; CHECK-NEXT: .LBB3_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: switch i32 %N, label %vector.ph [ @@ -807,101 +813,116 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB4_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB4_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r1 -; CHECK-NEXT: mov r10, r2 +; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: @ %vector.ph -; CHECK-NEXT: bic r8, r3, #3 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r7, r3, #4 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: sub.w r7, r8, #4 -; CHECK-NEXT: add.w r10, r2, r8, lsl #2 -; CHECK-NEXT: add.w r9, r1, r8, lsl #2 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r10, r1, r3, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: add.w r12, r0, r8, lsl #2 +; CHECK-NEXT: add.w r7, r2, r3, lsl #2 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s12, s10 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmullb.u32 q5, q4, q2 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov r4, r9, d10 +; CHECK-NEXT: lsrl r4, r9, #31 +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: subs.w r5, r4, #-1 ; CHECK-NEXT: vmullb.u32 q4, q3, q1 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r4, r5, d8 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: subs.w r6, r4, #-1 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: vmov r6, r7, d9 -; CHECK-NEXT: cset r5, lo -; CHECK-NEXT: lsrl r6, r7, #31 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r6 -; CHECK-NEXT: csetm r11, ne -; CHECK-NEXT: subs.w r5, r6, #-1 -; CHECK-NEXT: sbcs r5, r7, #0 -; CHECK-NEXT: vmov.32 q1[1], r11 +; CHECK-NEXT: sbcs r5, r9, #0 ; CHECK-NEXT: cset r5, lo ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov q1[2], q1[0], r11, r5 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vorn q1, q3, q1 -; CHECK-NEXT: vmullb.u32 q3, q2, q0 -; CHECK-NEXT: vmov r4, r5, d6 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: subs.w r6, 
r4, #-1 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: vmov r6, r7, d7 -; CHECK-NEXT: cset r5, lo -; CHECK-NEXT: lsrl r6, r7, #31 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r6 -; CHECK-NEXT: csetm r11, ne -; CHECK-NEXT: subs.w r5, r6, #-1 +; CHECK-NEXT: bfi r6, r5, #0, #8 +; CHECK-NEXT: vmov r8, r5, d11 +; CHECK-NEXT: lsrl r8, r5, #31 +; CHECK-NEXT: subs.w r11, r8, #-1 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r8 +; CHECK-NEXT: sbcs r7, r5, #0 +; CHECK-NEXT: vmov q2[3], q2[1], r9, r5 +; CHECK-NEXT: cset r7, lo +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: csetm r7, ne +; CHECK-NEXT: bfi r6, r7, #8, #8 +; CHECK-NEXT: vmov r4, r7, d8 +; CHECK-NEXT: lsrl r4, r7, #31 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: subs.w r5, r4, #-1 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: sbcs r5, r7, #0 -; CHECK-NEXT: vmov.32 q0[1], r11 +; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: cset r5, lo ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov q0[2], q0[0], r11, r5 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vorn q0, q2, q0 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: bfi r6, r5, #0, #8 +; CHECK-NEXT: vmov r2, r5, d9 +; CHECK-NEXT: lsrl r2, r5, #31 +; CHECK-NEXT: subs.w r3, r2, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r7, r5 +; CHECK-NEXT: cset r3, lo +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r6, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: vpsel q1, q1, q0 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB4_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r8, r3 +; CHECK-NEXT: ldrd r7, r3, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB4_8 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r8 +; CHECK-NEXT: sub.w lr, r3, r7 ; CHECK-NEXT: .LBB4_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 -; CHECK-NEXT: ldr r1, [r9], #4 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: lsrl r0, r1, #31 +; CHECK-NEXT: ldr r2, [r10], #4 +; CHECK-NEXT: umull r0, r3, r2, r0 +; CHECK-NEXT: lsrl r0, r3, #31 ; CHECK-NEXT: subs.w r2, r0, #-1 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: it hs ; CHECK-NEXT: movhs.w r0, #-1 -; CHECK-NEXT: str r0, [r10], #4 +; CHECK-NEXT: str r0, [r1], #4 ; CHECK-NEXT: le lr, .LBB4_7 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll --- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -36,40 +36,58 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds r2, r2, r0 -; CHECK-NEXT: eor.w r12, r3, r1 -; CHECK-NEXT: adc.w r0, r3, r1 -; CHECK-NEXT: eor.w r1, r3, r0 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: bic.w r1, r1, r12 +; CHECK-NEXT: vmov r0, r2, d2 
+; CHECK-NEXT: vmov r3, r1, d0 +; CHECK-NEXT: adds.w r12, r3, r0 +; CHECK-NEXT: vmov r0, r4, d1 +; CHECK-NEXT: adc.w lr, r1, r2 +; CHECK-NEXT: subs.w r3, r12, r3 +; CHECK-NEXT: sbcs.w r1, lr, r1 +; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov lr, r1, d2 -; CHECK-NEXT: cset r12, mi -; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r2, r0, #31 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: eor.w r5, r4, r1 -; CHECK-NEXT: adcs r1, r4 -; CHECK-NEXT: eors r4, r1 -; CHECK-NEXT: bic.w r5, r4, r5 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: cset r5, mi -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, mi +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r3, r1, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: mov.w r2, #-2147483648 +; CHECK-NEXT: eorne r1, r1, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: adds r1, r1, r0 +; CHECK-NEXT: adc.w r5, r4, r3 +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: sbcs.w r0, r5, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r1 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: asr.w r1, lr, #31 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vmov q0[3], q0[1], lr, r5 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cset r3, mi +; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: eorne.w r0, r2, r0, asr #31 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: eorne.w r1, r2, r1, asr #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: eorne r0, r0, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: asrs r0, r5, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: adr r0, .LCPI3_0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: veor q1, q1, q2 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI3_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483648 @ 0x80000000 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483648 @ 0x80000000 entry: %0 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 @@ -108,32 +126,32 @@ define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: uadd_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r2, d2 -; CHECK-NEXT: adcs lr, r12, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r2, r5 -; CHECK-NEXT: adcs r5, r12, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r3, #-1 -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r0, r4, d0 +; CHECK-NEXT: adc.w r12, r3, r1 +; CHECK-NEXT: subs.w r2, lr, r2 +; CHECK-NEXT: sbcs.w r2, r12, r3 +; CHECK-NEXT: vmov r3, 
r1, d2 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: adds r3, r3, r0 +; CHECK-NEXT: adcs r1, r4 +; CHECK-NEXT: subs r0, r3, r0 +; CHECK-NEXT: sbcs.w r0, r1, r4 +; CHECK-NEXT: vmov q1[2], q1[0], r3, lr +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: vmov q1[3], q1[1], r1, r12 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: pop {r4, pc} entry: %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 @@ -175,38 +193,60 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: subs r2, r2, r0 -; CHECK-NEXT: eor.w r12, r3, r1 -; CHECK-NEXT: sbc.w r0, r3, r1 -; CHECK-NEXT: eor.w r1, r3, r0 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: ands.w r1, r1, r12 -; CHECK-NEXT: vmov lr, r1, d2 -; CHECK-NEXT: cset r12, mi -; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r2, r0, #31 -; CHECK-NEXT: subs.w r3, r3, lr -; CHECK-NEXT: eor.w r5, r4, r1 -; CHECK-NEXT: sbc.w r1, r4, r1 -; CHECK-NEXT: eors r4, r1 -; CHECK-NEXT: ands r5, r4 -; CHECK-NEXT: cset r5, mi -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r3, r1, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: mov.w r2, #-2147483648 +; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: rsbs r2, r1, #0 +; CHECK-NEXT: sbcs.w r2, r0, r3 +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: cset lr, lt +; CHECK-NEXT: subs.w r12, r2, r1 +; CHECK-NEXT: sbc.w r5, r4, r3 +; CHECK-NEXT: subs.w r2, r12, r2 +; CHECK-NEXT: sbcs.w r2, r5, r4 +; CHECK-NEXT: vmov r3, r4, d3 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: cmp.w lr, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: eorne.w r0, r2, r0, asr #31 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: eorne r2, r2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: rsbs r1, r3, #0 +; CHECK-NEXT: sbcs.w r1, r0, r4 +; CHECK-NEXT: bfi r0, r2, #0, #8 +; CHECK-NEXT: vmov r2, r1, d1 +; CHECK-NEXT: cset lr, lt +; CHECK-NEXT: subs r3, r2, r3 +; CHECK-NEXT: sbc.w r4, r1, r4 +; CHECK-NEXT: subs r2, r3, r2 +; CHECK-NEXT: sbcs.w r1, r4, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r3 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp.w lr, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: eorne.w r1, r2, r1, asr #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: eorne r1, r1, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: asrs r1, r5, #31 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: asrs r0, r4, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: adr r0, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: veor q1, q1, q2 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483648 @ 0x80000000 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483648 @ 0x80000000 entry: %0 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 @@ -245,34 +285,32 @@ define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; 
CHECK-LABEL: usub_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: subs r0, r2, r0 -; CHECK-NEXT: sbcs.w r1, r3, r1 -; CHECK-NEXT: adc r2, r12, #0 -; CHECK-NEXT: rsbs.w lr, r2, #1 -; CHECK-NEXT: vmov r3, r2, d2 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: subs r3, r4, r3 -; CHECK-NEXT: sbcs.w r2, r5, r2 -; CHECK-NEXT: adc r5, r12, #0 -; CHECK-NEXT: rsbs.w r5, r5, #1 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r3, #0 -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: subs.w lr, r2, r0 +; CHECK-NEXT: vmov r0, r4, d0 +; CHECK-NEXT: sbc.w r12, r3, r1 +; CHECK-NEXT: subs.w r2, r2, lr +; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: vmov r3, r1, d2 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: subs r3, r0, r3 +; CHECK-NEXT: sbc.w r1, r4, r1 +; CHECK-NEXT: subs r0, r0, r3 +; CHECK-NEXT: sbcs.w r0, r4, r1 +; CHECK-NEXT: vmov q1[2], q1[0], r3, lr +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: vmov q1[3], q1[1], r1, r12 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: vbic q0, q1, q0 +; CHECK-NEXT: pop {r4, pc} entry: %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -182,72 +182,58 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vabd_loop_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d9} -; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: mov.w r12, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s18, s9 -; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov r7, s4 ; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vmov.f32 s8, s10 -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: asrs r4, r3, #31 -; CHECK-NEXT: subs.w r8, r3, r5 -; CHECK-NEXT: sbc.w r4, r4, r5, asr #31 -; CHECK-NEXT: asrs r5, r4, #31 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: bfi r4, r5, #0, #4 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: subs.w r9, r5, r7 -; CHECK-NEXT: asr.w r6, r5, #31 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: sbc.w r6, r6, r7, asr #31 -; CHECK-NEXT: and.w r6, r12, r6, asr #31 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r4, r6, #4, #4 -; CHECK-NEXT: vmov r6, s6 -; CHECK-NEXT: vmov.f32 s6, s11 
-; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: asrs r7, r6, #31 -; CHECK-NEXT: subs.w r10, r6, r3 -; CHECK-NEXT: asr.w r6, r5, #31 -; CHECK-NEXT: sbc.w r3, r7, r3, asr #31 -; CHECK-NEXT: vmov r7, s8 -; CHECK-NEXT: asr.w r11, r3, #31 -; CHECK-NEXT: and.w r3, r12, r3, asr #31 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: subs r5, r5, r7 -; CHECK-NEXT: sbc.w r6, r6, r7, asr #31 -; CHECK-NEXT: asrs r6, r6, #31 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r11 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov q1[2], q1[0], r8, r5 -; CHECK-NEXT: vmov q1[3], q1[1], r9, r10 -; CHECK-NEXT: and r6, r6, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r4, r6, #8, #4 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: vmov.f32 s10, s5 +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: asr.w r12, r3, #31 +; CHECK-NEXT: subs.w r8, r3, r4 +; CHECK-NEXT: sbc.w r12, r12, r4, asr #31 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: vmov r6, s10 +; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: subs r4, r4, r6 +; CHECK-NEXT: sbc.w r9, r3, r6, asr #31 +; CHECK-NEXT: vmov r6, s8 +; CHECK-NEXT: subs r5, r7, r6 +; CHECK-NEXT: vmov q2[2], q2[0], r5, r8 +; CHECK-NEXT: asr.w r5, r7, #31 +; CHECK-NEXT: sbc.w r5, r5, r6, asr #31 +; CHECK-NEXT: vmov r6, s14 +; CHECK-NEXT: vmov r7, s6 +; CHECK-NEXT: subs r3, r7, r6 +; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 +; CHECK-NEXT: asr.w r3, r5, #31 +; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: bfi r4, r3, #0, #4 +; CHECK-NEXT: asr.w r3, r9, #31 +; CHECK-NEXT: bfi r4, r3, #4, #4 +; CHECK-NEXT: asr.w r3, r12, #31 +; CHECK-NEXT: bfi r4, r3, #8, #4 +; CHECK-NEXT: asr.w r3, r7, #31 +; CHECK-NEXT: sbc.w r3, r3, r6, asr #31 +; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: bfi r4, r3, #12, #4 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.i32 q1, q0, q1 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vsubt.i32 q2, q0, q2 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d9} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: br label %vector.body @@ -368,10 +354,8 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vabd_loop_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: mov.w lr, #256 @@ -379,65 +363,53 @@ ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: .LBB11_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s12, s8 -; CHECK-NEXT: vmov.f32 s14, s9 -; CHECK-NEXT: vand q4, q3, q0 -; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: vmov r3, r4, d8 -; CHECK-NEXT: vmov.f32 s20, s12 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vand q5, q5, q0 -; CHECK-NEXT: vmov.f32 s8, s10 -; CHECK-NEXT: vmov r5, r6, d10 -; CHECK-NEXT: vmov.f32 s10, s11 -; CHECK-NEXT: vmov.f32 s12, s14 +; CHECK-NEXT: vldrw.u32 q4, [r1], #16 +; CHECK-NEXT: vldrw.u32 q5, [r0], #16 +; CHECK-NEXT: vmov.f32 s8, s18 +; CHECK-NEXT: 
vmov.f32 s10, s19 +; CHECK-NEXT: vmov.f32 s12, s22 ; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov.f32 s14, s15 +; CHECK-NEXT: vmov.f32 s14, s23 ; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: subs.w r8, r5, r3 -; CHECK-NEXT: vmov r7, r3, d11 -; CHECK-NEXT: sbc.w r4, r6, r4 -; CHECK-NEXT: asrs r5, r4, #31 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: bfi r4, r5, #0, #4 -; CHECK-NEXT: vmov r5, r6, d9 -; CHECK-NEXT: subs.w r9, r7, r5 -; CHECK-NEXT: mov.w r7, #1 -; CHECK-NEXT: sbcs r3, r6 -; CHECK-NEXT: and.w r3, r7, r3, asr #31 +; CHECK-NEXT: vmov r3, r12, d4 +; CHECK-NEXT: vmov r4, r5, d6 +; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vmov.f32 s22, s21 +; CHECK-NEXT: vand q4, q4, q0 +; CHECK-NEXT: vand q5, q5, q0 +; CHECK-NEXT: vmov r6, r7, d11 +; CHECK-NEXT: subs.w r8, r4, r3 +; CHECK-NEXT: sbc.w r12, r5, r12 +; CHECK-NEXT: vmov r5, r3, d9 +; CHECK-NEXT: subs.w r10, r6, r5 +; CHECK-NEXT: sbc.w r9, r7, r3 +; CHECK-NEXT: vmov r6, r7, d8 +; CHECK-NEXT: vmov r4, r3, d10 +; CHECK-NEXT: subs r4, r4, r6 +; CHECK-NEXT: sbcs r3, r7 +; CHECK-NEXT: vmov q4[2], q4[0], r4, r8 +; CHECK-NEXT: vmov r4, r6, d5 ; CHECK-NEXT: vmov r7, r5, d7 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r4, r3, #4, #4 -; CHECK-NEXT: vmov r3, r6, d5 -; CHECK-NEXT: subs.w r10, r7, r3 -; CHECK-NEXT: vmov r7, r3, d4 -; CHECK-NEXT: sbcs r5, r6 -; CHECK-NEXT: vmov r6, r12, d6 -; CHECK-NEXT: asr.w r11, r5, #31 -; CHECK-NEXT: subs r6, r6, r7 -; CHECK-NEXT: sbc.w r3, r12, r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r11 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov q2[2], q2[0], r8, r6 -; CHECK-NEXT: vmov q2[3], q2[1], r9, r10 -; CHECK-NEXT: and r3, r3, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: subs r4, r7, r4 +; CHECK-NEXT: vmov q4[3], q4[1], r10, r4 +; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: bfi r4, r3, #0, #4 +; CHECK-NEXT: asr.w r3, r9, #31 +; CHECK-NEXT: bfi r4, r3, #4, #4 +; CHECK-NEXT: asr.w r3, r12, #31 ; CHECK-NEXT: bfi r4, r3, #8, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: and.w r3, r3, r5, asr #31 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: sbc.w r3, r5, r6 +; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: bfi r4, r3, #12, #4 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.i32 q2, q1, q2 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vsubt.i32 q4, q1, q4 +; CHECK-NEXT: vstrb.8 q4, [r2], #16 ; CHECK-NEXT: le lr, .LBB11_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll @@ -367,23 +367,27 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, <2 x i64> %srcb, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_eq_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r12, r2, d2 -; CHECK-NEXT: vmov r3, r1, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: vmov r12, r2, d3 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r3, r0, d1 +; CHECK-NEXT: eors r0, r2 
; CHECK-NEXT: eor.w r2, r3, r12 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q1, q3, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q2, q3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, %srcb @@ -394,25 +398,25 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_slt_v2i64(<2 x i64> %src, <2 x i64> %srcb, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_slt_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 -; CHECK-NEXT: vmov r1, r12, d2 -; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: vmov r3, r2, d1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r1, r3, r1 -; CHECK-NEXT: sbcs.w r1, r2, r12 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q1, q3, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: subs r0, r3, r0 +; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q2, q3 ; CHECK-NEXT: bx lr entry: %c = icmp slt <2 x i64> %src, %srcb @@ -423,23 +427,27 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, <2 x i64> %srcb, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_eq_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r12, r2, d2 -; CHECK-NEXT: vmov r3, r1, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: vmov r12, r2, d3 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r3, r0, d1 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: eor.w r2, r3, r12 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q1, q3, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q2, q3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, %srcb @@ -450,58 +458,60 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: vcmp_multi_v2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; 
CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q0, q2, q0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: subs r3, r0, r2 -; CHECK-NEXT: asr.w r1, r0, #31 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: sbcs.w r1, r1, r2, asr #31 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm lr, ne -; CHECK-NEXT: subs r1, r2, r3 -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: sbcs.w r1, r12, r3, asr #31 -; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r1, lr -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: subs r2, r1, r3 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, lr -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q1, q3, q1 -; CHECK-NEXT: vbic q0, q0, q1 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cset r3, ne +; CHECK-NEXT: ands r1, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: subs r2, r1, r3 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cset r3, ne +; CHECK-NEXT: ands r1, r3 +; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: bx lr %a4 = icmp eq <2 x i64> %a, zeroinitializer %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c %a6 = icmp ne <2 x i32> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll @@ -433,21 +433,25 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_eq_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r3, r1 ; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r12, r3, d0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: eors r1, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; 
CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r12, r2, d1 +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -460,21 +464,25 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_eq_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r3, r1 ; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r12, r3, d0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: eors r1, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r12, r2, d1 +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -487,58 +495,60 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: vcmp_multi_v2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q0, q2, q0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: subs r3, r0, r2 -; CHECK-NEXT: asr.w r1, r0, #31 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: sbcs.w r1, r1, r2, asr #31 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm lr, ne -; CHECK-NEXT: subs r1, r2, r3 -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: sbcs.w r1, r12, r3, asr #31 -; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r1, lr -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: subs r2, r1, r3 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, lr -; 
CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q1, q3, q1 -; CHECK-NEXT: vbic q0, q0, q1 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cset r3, ne +; CHECK-NEXT: ands r1, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: subs r2, r1, r3 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cset r3, ne +; CHECK-NEXT: ands r1, r3 +; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: bx lr %a4 = icmp eq <2 x i64> %a, zeroinitializer %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c %a6 = icmp ne <2 x i32> %b, zeroinitializer @@ -984,21 +994,25 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_r_eq_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r3, r1 ; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r12, r3, d0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: eors r1, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r12, r2, d1 +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -1011,21 +1025,25 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_r_eq_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r3, r1 ; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r12, r3, d0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: eors r1, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r12, r2, d1 +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: cset r0, 
eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -1038,58 +1056,60 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: vcmp_r_multi_v2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q0, q2, q0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: subs r3, r0, r2 -; CHECK-NEXT: asr.w r1, r0, #31 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: sbcs.w r1, r1, r2, asr #31 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm lr, ne -; CHECK-NEXT: subs r1, r2, r3 -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: sbcs.w r1, r12, r3, asr #31 -; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r1, lr -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: subs r2, r1, r3 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, lr -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q1, q3, q1 -; CHECK-NEXT: vbic q0, q0, q1 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cset r3, ne +; CHECK-NEXT: ands r1, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: subs r2, r1, r3 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: cset r2, lt +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cset r3, ne +; CHECK-NEXT: ands r1, r3 +; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: bx lr %a4 = icmp eq <2 x i64> %a, zeroinitializer %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c %a6 = icmp ne <2 x i32> 
%b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll @@ -361,17 +361,21 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_eqz_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, zeroinitializer @@ -382,17 +386,21 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_eqz_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, zeroinitializer @@ -763,17 +771,21 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_r_eqz_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> zeroinitializer, %src @@ -784,17 +796,21 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_r_eqz_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 
-; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vctp.ll b/llvm/test/CodeGen/Thumb2/mve-vctp.ll --- a/llvm/test/CodeGen/Thumb2/mve-vctp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vctp.ll @@ -175,22 +175,10 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_ult_v2i64(i64 %n, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_ult_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: rsbs.w r3, r0, #1 -; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: sbcs.w r3, r2, r1 -; CHECK-NEXT: cset r3, lo -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: sbcs.w r0, r2, r1 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vctp.64 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %n, i32 0 @@ -203,23 +191,11 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_uge_v2i64(i64 %n, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_uge_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #1 -; CHECK-NEXT: vldr s8, .LCPI11_0 -; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: cset r0, hs -; CHECK-NEXT: vmov.f32 s9, s8 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov s10, r0 -; CHECK-NEXT: vmov.f32 s11, s10 -; CHECK-NEXT: vbic q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vctp.64 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI11_0: -; CHECK-NEXT: .long 0xffffffff @ float NaN entry: %i = insertelement <2 x i64> undef, i64 %n, i32 0 %ns = shufflevector <2 x i64> %i, <2 x i64> undef, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -45,16 +45,24 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %b) { ; CHECK-LABEL: add_v2i32_v2i64_zext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: vand q0, q0, 
q1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -77,15 +85,22 @@ ; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -176,96 +191,89 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) { ; CHECK-LABEL: add_v8i16_v8i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.i8 q3, #0x0 +; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vcmp.i16 eq, q1, zr -; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.u16 r1, q2[0] +; CHECK-NEXT: vpsel q5, q4, q3 +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.u16 r1, q5[0] ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.u16 r1, q2[1] +; CHECK-NEXT: vmov.u16 r0, q5[3] +; CHECK-NEXT: vmov.u16 r1, q5[1] ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q6, q4, q3 +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.i64 q1, #0xffff -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r12, r2, d7 -; CHECK-NEXT: vmov r3, r1, d6 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: add.w r2, r3, r12 -; CHECK-NEXT: ubfx r3, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vand q7, q2, q1 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vpsel q7, q7, q2 +; CHECK-NEXT: vmov r0, r1, d15 +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; 
CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r0 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vand q6, q6, q1 +; CHECK-NEXT: vpsel q6, q6, q2 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vpsel q3, q4, q3 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r0, r3, d6 +; CHECK-NEXT: vpsel q4, q4, q2 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, r3, d4 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d5 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -278,100 +286,96 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) { ; CHECK-LABEL: add_v8i16_v8i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; 
CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vcmp.i16 eq, q1, zr -; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 -; CHECK-NEXT: vmov.s16 r1, q0[1] -; CHECK-NEXT: vmov.s16 r2, q0[0] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.u16 r1, q4[0] +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.u16 r1, q4[1] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vmov.s16 r0, q0[1] +; CHECK-NEXT: vmov.s16 r1, q0[0] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q6[2], q6[0], r1, r0 +; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r1, r12, d5 -; CHECK-NEXT: vmov r3, r2, d4 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: ubfx r3, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.s16 r0, q0[3] +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov q6[3], q6[1], r1, r0 +; CHECK-NEXT: vpsel q6, q6, q1 +; CHECK-NEXT: vmov r0, r1, d13 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: vmov.s16 r2, q0[3] ; CHECK-NEXT: vmov.s16 r3, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r0 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, r3, d4 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vpsel q5, q5, q1 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u16 r3, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.u16 r3, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r3 -; CHECK-NEXT: vmov.s16 r0, q0[5] +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q4[6] +; CHECK-NEXT: adcs r1, r3 
+; CHECK-NEXT: vmov.u16 r3, q4[4] +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q4[7] +; CHECK-NEXT: vmov.u16 r3, q4[5] +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.s16 r2, q0[5] ; CHECK-NEXT: vmov.s16 r3, q0[4] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r0, r3, d2 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d3 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 -; CHECK-NEXT: vmov.s16 r1, q0[7] -; CHECK-NEXT: vmov.s16 r2, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vpsel q3, q3, q1 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.s16 r2, q0[7] +; CHECK-NEXT: vmov.s16 r3, q0[6] +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -417,19 +421,28 @@ ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xffff +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r1, d0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -443,16 
+456,23 @@ ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q2, #0xffff +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: sxth r1, r1 @@ -460,7 +480,7 @@ ; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -650,115 +670,325 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vcmp.i8 eq, q1, zr -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q5, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.u8 r0, q5[0] +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.u8 r0, q5[1] +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.u8 r0, q5[2] +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov.u8 r0, q5[3] +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov.u8 r0, q5[4] +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u8 r0, q5[5] +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u8 r0, q5[6] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u8 r0, q5[7] +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vpsel q6, q1, q0 +; CHECK-NEXT: vmov.u16 r0, q6[2] +; CHECK-NEXT: vmov.u16 r1, q6[0] +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q6[3] +; CHECK-NEXT: vmov.u16 r1, q6[1] +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vpsel q7, q1, q0 +; CHECK-NEXT: vmov r0, r1, d14 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r0, r1 +; CHECK-NEXT: vmov.u8 r0, q2[1] +; CHECK-NEXT: vmov.u8 r1, q2[0] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 +; CHECK-NEXT: vmov.i64 q3, #0xff +; CHECK-NEXT: vand q0, q4, q3 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, r3, d15 +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[3] +; CHECK-NEXT: vmov.u8 r3, q2[2] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; 
CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q6[6] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q6[4] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q6[7] +; CHECK-NEXT: vmov.u16 r3, q6[5] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[5] +; CHECK-NEXT: vmov.u8 r3, q2[4] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[7] +; CHECK-NEXT: vmov.u8 r3, q2[6] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q5[8] +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov.u8 r2, q5[9] +; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov.u8 r2, q5[10] +; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.u8 r2, q5[11] +; CHECK-NEXT: vmov.16 q6[3], r2 +; CHECK-NEXT: vmov.u8 r2, q5[12] +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.u8 r2, q5[13] +; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov.u8 r2, q5[14] +; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.u8 r2, q5[15] +; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vpsel q5, q1, q7 +; CHECK-NEXT: vmov.u16 r2, q5[2] +; CHECK-NEXT: vmov.u16 r3, q5[0] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[9] +; CHECK-NEXT: vmov.u8 r3, q2[8] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[11] +; CHECK-NEXT: vmov.u8 r3, q2[10] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.u16 r3, q5[5] +; 
CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q1, q1, q7 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[13] +; CHECK-NEXT: vmov.u8 r3, q2[12] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[15] +; CHECK-NEXT: vmov.u8 r3, q2[14] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %b, zeroinitializer + %xx = zext <16 x i8> %x to <16 x i64> + %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) + ret i64 %z +} + +define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) { +; CHECK-LABEL: add_v16i8_v16i64_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vcmp.i8 eq, q1, zr +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vpsel q4, q2, q1 ; CHECK-NEXT: vmov.u8 r0, q4[0] -; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.16 q3[0], r0 ; CHECK-NEXT: vmov.u8 r0, q4[1] -; CHECK-NEXT: vmov.16 q1[1], r0 +; CHECK-NEXT: vmov.16 q3[1], r0 ; CHECK-NEXT: vmov.u8 r0, q4[2] -; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.16 q3[2], r0 ; CHECK-NEXT: vmov.u8 r0, q4[3] -; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov.16 q3[3], r0 ; CHECK-NEXT: vmov.u8 r0, q4[4] -; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.16 q3[4], r0 ; CHECK-NEXT: vmov.u8 r0, q4[5] -; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.16 q3[5], r0 ; CHECK-NEXT: vmov.u8 r0, q4[6] -; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.16 q3[6], r0 ; CHECK-NEXT: vmov.u8 r0, q4[7] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vcmp.i16 ne, q1, zr -; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vpsel q5, q2, q1 ; CHECK-NEXT: vmov.u16 r0, q5[2] ; CHECK-NEXT: vmov.u16 r1, q5[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 ; CHECK-NEXT: vmov.u16 r0, q5[3] ; CHECK-NEXT: vmov.u16 r1, q5[1] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vcmp.i32 ne, q1, zr -; CHECK-NEXT: vmov.i64 q1, #0xff -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r1 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: vmov q7[2], q7[0], r2, r1 -; CHECK-NEXT: vand q7, q7, q1 -; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r12, r2, d13 -; CHECK-NEXT: vmov 
r3, r1, d12 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: add.w r2, r3, r12 -; CHECK-NEXT: ubfx r3, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov q7[2], q7[0], r3, r0 -; CHECK-NEXT: vand q7, q7, q1 -; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r0, r3, d12 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vpsel q6, q2, q1 +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r0, r1 +; CHECK-NEXT: vmov.s8 r0, q0[1] +; CHECK-NEXT: vmov.s8 r1, q0[0] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q7[2], q7[0], r1, r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov q7[3], q7[1], r1, r0 +; CHECK-NEXT: vpsel q7, q7, q3 +; CHECK-NEXT: vmov r0, r1, d15 +; CHECK-NEXT: vmov r2, r3, d14 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[3] +; CHECK-NEXT: vmov.s8 r3, q0[2] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vpsel q6, q6, q3 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[7] ; CHECK-NEXT: vmov.u16 r3, q5[5] ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r0, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r0 -; CHECK-NEXT: vand q6, q6, q1 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r0, r3, d10 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d11 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r1 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov q6[2], q6[0], r2, r1 -; CHECK-NEXT: vand q6, q6, q1 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r1, r2, d10 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d11 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: vpsel q5, q2, q1 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[5] +; CHECK-NEXT: vmov.s8 r3, q0[4] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; 
CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vpsel q6, q6, q3 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[7] +; CHECK-NEXT: vmov.s8 r3, q0[6] +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vpsel q5, q5, q3 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -775,172 +1005,50 @@ ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vcmp.i16 ne, q5, zr -; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.u16 r3, q2[0] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.u16 r3, q2[1] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r0 -; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r0, r3, d6 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d7 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r1, r2, d6 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: adc.w r12, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, r3, d4 -; CHECK-NEXT: adds.w lr, r1, r0 -; CHECK-NEXT: adc.w r1, r12, r3 -; CHECK-NEXT: vmov r3, r0, d5 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; 
CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r7, pc} -entry: - %c = icmp eq <16 x i8> %b, zeroinitializer - %xx = zext <16 x i8> %x to <16 x i64> - %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer - %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) - ret i64 %z -} - -define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) { -; CHECK-LABEL: add_v16i8_v16i64_sext: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vcmp.i8 eq, q1, zr -; CHECK-NEXT: vmov.i8 q1, #0x0 -; CHECK-NEXT: vmov.i8 q2, #0xff -; CHECK-NEXT: vpsel q3, q2, q1 -; CHECK-NEXT: vmov.u8 r0, q3[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[1] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u8 r0, q3[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q3[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q3[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q4, q2, q1 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.u16 r1, q4[0] -; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.u16 r1, q4[1] -; CHECK-NEXT: vmov q5[3], q5[1], r1, r0 +; CHECK-NEXT: vmov.u16 r2, q4[2] +; CHECK-NEXT: vmov.u16 r3, q4[0] +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q4[3] +; CHECK-NEXT: vmov.u16 r3, q4[1] +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q5, zr -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r1 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r1 -; CHECK-NEXT: vmov.s8 r1, q0[1] -; CHECK-NEXT: vmov.s8 r2, q0[0] -; CHECK-NEXT: vmov q6[2], q6[0], r2, r1 -; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: vpsel q5, q2, q1 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[9] +; CHECK-NEXT: vmov.s8 r3, q0[8] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r1 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r1, r12, d11 -; CHECK-NEXT: vmov r3, r2, d10 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: ubfx r3, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: vmov q5[2], q5[0], r0, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r0, r3 -; CHECK-NEXT: vmov.s8 r0, q0[3] -; CHECK-NEXT: vmov.s8 r3, q0[2] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; 
CHECK-NEXT: vmov q6[3], q6[1], r3, r0 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r0, r3, d10 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vpsel q6, q6, q3 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[11] +; CHECK-NEXT: vmov.s8 r3, q0[10] +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vpsel q5, q5, q3 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov r2, r3, d11 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q4[6] ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q4[4] @@ -949,156 +1057,42 @@ ; CHECK-NEXT: vmov.u16 r3, q4[5] ; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q5, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r0, r3 -; CHECK-NEXT: vmov.s8 r0, q0[5] -; CHECK-NEXT: vmov.s8 r3, q0[4] -; CHECK-NEXT: vmov q5[2], q5[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r3, r0 -; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r0, r3, d8 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d9 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 -; CHECK-NEXT: vmov.s8 r1, q0[7] -; CHECK-NEXT: vmov.s8 r2, q0[6] -; CHECK-NEXT: vmov q5[2], q5[0], r2, r1 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r1 -; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r1, r2, d8 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d9 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vmov.u8 r2, q3[8] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u8 r2, q3[9] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u8 r2, q3[10] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u8 r2, q3[11] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u8 r2, q3[12] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u8 r2, q3[13] -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.u8 r2, q3[14] -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.u8 r2, q3[15] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.u16 r3, q1[0] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.u16 r3, q1[1] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[13] +; CHECK-NEXT: vmov.s8 r3, q0[12] ; CHECK-NEXT: vcmp.i32 ne, 
q2, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.s8 r0, q0[9] -; CHECK-NEXT: vmov.s8 r3, q0[8] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r0 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, r3, d4 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d5 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 -; CHECK-NEXT: vmov.s8 r1, q0[11] -; CHECK-NEXT: vmov.s8 r2, q0[10] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r1, r2, d4 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d5 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.u16 r3, q1[4] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.u16 r3, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r3 -; CHECK-NEXT: vmov.s8 r0, q0[13] -; CHECK-NEXT: vmov.s8 r3, q0[12] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r0, r3, d2 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d3 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 -; CHECK-NEXT: vmov.s8 r1, q0[15] -; CHECK-NEXT: vmov.s8 r2, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[15] +; CHECK-NEXT: vmov.s8 r3, q0[14] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: 
vpsel q0, q0, q3 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer @@ -1111,98 +1105,91 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %b) { ; CHECK-LABEL: add_v8i8_v8i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov.i8 q3, #0x0 ; CHECK-NEXT: vcmp.i16 eq, q1, zr -; CHECK-NEXT: vmov.i8 q1, #0x0 -; CHECK-NEXT: vpsel q2, q2, q1 +; CHECK-NEXT: vmov.i8 q4, #0xff +; CHECK-NEXT: vpsel q5, q4, q3 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.u16 r1, q2[0] +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.u16 r1, q5[0] ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.u16 r1, q2[1] +; CHECK-NEXT: vmov.u16 r0, q5[3] +; CHECK-NEXT: vmov.u16 r1, q5[1] ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q6, q4, q3 +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.i64 q1, #0xffff -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r12, r2, d7 -; CHECK-NEXT: vmov r3, r1, d6 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: add.w r2, r3, r12 -; CHECK-NEXT: ubfx r3, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vand q7, q2, q1 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vpsel q7, q7, q2 +; CHECK-NEXT: vmov r0, r1, d15 +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r0 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vand q6, q6, q1 +; CHECK-NEXT: vpsel q6, q6, q2 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vpsel q3, q4, q3 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov q4[2], q4[0], 
r2, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r0, r3, d6 +; CHECK-NEXT: vpsel q4, q4, q2 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, r3, d4 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d5 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i8> %b, zeroinitializer @@ -1215,109 +1202,105 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %b) { ; CHECK-LABEL: add_v8i8_v8i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vcmp.i16 eq, q1, zr -; CHECK-NEXT: vmov.i8 q1, #0x0 -; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: 
vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.u16 r1, q4[0] +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.u16 r1, q4[1] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q6[2], q6[0], r1, r0 +; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r1, r12, d5 -; CHECK-NEXT: vmov r3, r2, d4 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: ubfx r3, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov q6[3], q6[1], r1, r0 +; CHECK-NEXT: vpsel q6, q6, q1 +; CHECK-NEXT: vmov r0, r1, d13 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r0 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, r3, d4 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vpsel q5, q5, q1 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u16 r3, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.u16 r3, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q4[6] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q4[4] +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q4[7] +; CHECK-NEXT: vmov.u16 r3, q4[5] +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[5] ; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; 
CHECK-NEXT: vmov q2[2], q2[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r0, r3, d2 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r0, d3 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: ubfx r1, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vpsel q3, q3, q1 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i8> %b, zeroinitializer @@ -1366,19 +1349,28 @@ ; CHECK-LABEL: add_v2i8_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xff +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r1, d0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -1392,16 +1384,23 @@ ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q2, #0xff +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, s6 
-; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 @@ -1409,7 +1408,7 @@ ; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -1426,15 +1425,22 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %b) { ; CHECK-LABEL: add_v2i64_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d3 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -1496,16 +1502,24 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -1533,15 +1547,22 @@ ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; 
CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -1640,101 +1661,94 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.i8 q3, #0x0 +; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vcmp.i16 eq, q1, zr -; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.u16 r3, q2[0] +; CHECK-NEXT: vpsel q5, q4, q3 +; CHECK-NEXT: vmov.u16 r2, q5[2] +; CHECK-NEXT: vmov.u16 r3, q5[0] ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.u16 r3, q2[1] +; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.u16 r3, q5[1] ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q1, zr -; CHECK-NEXT: vmov.i64 q1, #0xffff -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 -; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r12 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r12 -; CHECK-NEXT: vmov.u16 r12, q0[1] +; CHECK-NEXT: vpsel q6, q4, q3 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r12 -; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r12, lr, d7 -; CHECK-NEXT: vmov r3, r4, d6 -; CHECK-NEXT: orr.w lr, lr, r4 -; CHECK-NEXT: ubfx r4, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: add r3, r12 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r4 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vand q7, q2, q1 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vpsel q7, q7, q2 +; CHECK-NEXT: vmov r12, lr, d15 +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: orr.w lr, lr, r3 +; CHECK-NEXT: add r12, r2 +; CHECK-NEXT: vmov r3, r2, d13 +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.u16 r4, q0[2] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r2 +; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vand q6, q6, q1 +; CHECK-NEXT: vpsel q6, q6, q2 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vpsel q3, q4, q3 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; 
CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r2, r4, d6 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: adc.w r3, lr, r4 -; CHECK-NEXT: vmov r4, r2, d7 -; CHECK-NEXT: adds.w lr, r12, r4 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vpsel q4, q4, q2 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d7 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov.u16 r4, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: vmov r3, r4, d5 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r4, r4, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, lr, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, r2, d1 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i64> @@ -1747,105 +1761,101 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vcmp.i16 eq, q1, zr -; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.u16 r3, q1[0] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.u16 r3, q1[1] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 -; CHECK-NEXT: vmov.s16 r2, q0[0] -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 -; CHECK-NEXT: and r3, lr, #1 
-; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r12 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r12 -; CHECK-NEXT: vmov.s16 r3, q0[1] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov.u16 r2, q4[2] +; CHECK-NEXT: vmov.u16 r3, q4[0] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q4[3] +; CHECK-NEXT: vmov.u16 r3, q4[1] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.s16 r2, q0[1] +; CHECK-NEXT: vmov.s16 r3, q0[0] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vpsel q6, q6, q1 +; CHECK-NEXT: vmov lr, r12, d13 +; CHECK-NEXT: vmov r3, r2, d12 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: vmov.s16 r2, q0[3] +; CHECK-NEXT: vmov.s16 r3, q0[2] +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vpsel q5, q5, q1 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u16 r2, q4[6] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u16 r3, q4[4] +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q4[7] +; CHECK-NEXT: vmov.u16 r3, q4[5] +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r2, r12, d5 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: adds r5, r3, r2 -; CHECK-NEXT: ubfx r2, lr, #8, #1 -; CHECK-NEXT: adc.w r3, r4, r12 -; CHECK-NEXT: ubfx r4, lr, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 -; CHECK-NEXT: vmov.s16 r2, q0[3] -; CHECK-NEXT: vmov.s16 r4, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 +; CHECK-NEXT: vmov.s16 r2, q0[5] +; CHECK-NEXT: vmov.s16 r3, q0[4] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r2 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r2, r4, d4 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u16 r5, q1[6] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u16 r4, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q1[7] -; CHECK-NEXT: vmov.u16 r4, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vpsel q3, q3, q1 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; 
CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.s16 r2, q0[7] +; CHECK-NEXT: vmov.s16 r3, q0[6] ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: and r2, r5, #1 -; CHECK-NEXT: ubfx r4, r5, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 -; CHECK-NEXT: vmov.s16 r2, q0[5] -; CHECK-NEXT: vmov.s16 r4, q0[4] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r2 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r2, r4, d2 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, r2, d3 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: ubfx r3, r5, #12, #1 -; CHECK-NEXT: ubfx r5, r5, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-NEXT: vmov.s16 r3, q0[7] -; CHECK-NEXT: vmov.s16 r5, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, r5, d0 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r2, r5 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i64> @@ -1858,23 +1868,34 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xffff +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, r12, d0 -; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: orr.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: bx lr +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer %xx = zext <2 x i16> %x to <2 x i64> @@ 
-1890,16 +1911,23 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q2, #0xffff +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: sxth r3, r3 @@ -1907,7 +1935,7 @@ ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -2077,115 +2105,332 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vcmp.i8 eq, q1, zr -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q5, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.u8 r2, q5[0] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q5[1] +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.u8 r2, q5[2] +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.u8 r2, q5[3] +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.u8 r2, q5[4] +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov.u8 r2, q5[5] +; CHECK-NEXT: vmov.16 q3[5], r2 +; CHECK-NEXT: vmov.u8 r2, q5[6] +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmov.u8 r2, q5[7] +; CHECK-NEXT: vmov.16 q3[7], r2 +; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vpsel q6, q1, q0 +; CHECK-NEXT: vmov.u16 r2, q6[2] +; CHECK-NEXT: vmov.u16 r3, q6[0] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q6[3] +; CHECK-NEXT: vmov.u16 r3, q6[1] +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vpsel q7, q1, q0 +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[1] +; CHECK-NEXT: vmov.u8 r3, q2[0] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov.i64 q3, #0xff +; CHECK-NEXT: vand q0, q4, q3 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: orr.w lr, lr, r3 +; CHECK-NEXT: add r12, r2 +; CHECK-NEXT: vmov r3, r2, d15 +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.u8 r2, q2[3] +; CHECK-NEXT: vmov.u8 r3, q2[2] +; CHECK-NEXT: 
vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u16 r2, q6[6] +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov.u16 r3, q6[4] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q6[7] +; CHECK-NEXT: vmov.u16 r3, q6[5] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[5] +; CHECK-NEXT: vmov.u8 r3, q2[4] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[7] +; CHECK-NEXT: vmov.u8 r3, q2[6] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q5[8] +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov.u8 r2, q5[9] +; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov.u8 r2, q5[10] +; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.u8 r2, q5[11] +; CHECK-NEXT: vmov.16 q6[3], r2 +; CHECK-NEXT: vmov.u8 r2, q5[12] +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.u8 r2, q5[13] +; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov.u8 r2, q5[14] +; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.u8 r2, q5[15] +; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vpsel q5, q1, q7 +; CHECK-NEXT: vmov.u16 r2, q5[2] +; CHECK-NEXT: vmov.u16 r3, q5[0] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[9] +; CHECK-NEXT: vmov.u8 r3, q2[8] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[11] +; CHECK-NEXT: vmov.u8 r3, q2[10] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] 
+; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q1, q1, q7 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[13] +; CHECK-NEXT: vmov.u8 r3, q2[12] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q2[15] +; CHECK-NEXT: vmov.u8 r3, q2[14] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r7, pc} +entry: + %c = icmp eq <16 x i8> %b, zeroinitializer + %xx = zext <16 x i8> %x to <16 x i64> + %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) + %r = add i64 %z, %a + ret i64 %r +} + +define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b, i64 %a) { +; CHECK-LABEL: add_v16i8_v16i64_acc_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vcmp.i8 eq, q1, zr +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vpsel q4, q2, q1 ; CHECK-NEXT: vmov.u8 r2, q4[0] -; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[1] -; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov.16 q3[1], r2 ; CHECK-NEXT: vmov.u8 r2, q4[2] -; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.16 q3[2], r2 ; CHECK-NEXT: vmov.u8 r2, q4[3] -; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov.16 q3[3], r2 ; CHECK-NEXT: vmov.u8 r2, q4[4] -; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.16 q3[4], r2 ; CHECK-NEXT: vmov.u8 r2, q4[5] -; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.16 q3[5], r2 ; CHECK-NEXT: vmov.u8 r2, q4[6] -; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.16 q3[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[7] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vcmp.i16 ne, q1, zr -; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov.16 q3[7], r2 +; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vpsel q5, q2, q1 ; CHECK-NEXT: vmov.u16 r2, q5[2] ; CHECK-NEXT: vmov.u16 r3, q5[0] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[3] ; CHECK-NEXT: vmov.u16 r3, q5[1] -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q1, zr -; CHECK-NEXT: vmov.i64 q1, #0xff -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsb.w 
r12, r3, #0 -; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r12 -; CHECK-NEXT: vmov q6[3], q6[1], r3, r12 -; CHECK-NEXT: vmov.u8 r12, q0[1] -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vmov q7[2], q7[0], r3, r12 -; CHECK-NEXT: vand q7, q7, q1 -; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r12, lr, d13 -; CHECK-NEXT: vmov r3, r4, d12 -; CHECK-NEXT: orr.w lr, lr, r4 -; CHECK-NEXT: ubfx r4, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: add r3, r12 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r4 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r4 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: vmov.u8 r4, q0[2] -; CHECK-NEXT: vmov q7[2], q7[0], r4, r2 -; CHECK-NEXT: vand q7, q7, q1 -; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r2, r4, d12 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: adc.w r3, lr, r4 -; CHECK-NEXT: vmov r4, r2, d13 -; CHECK-NEXT: adds.w lr, r12, r4 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov.u16 r3, q5[6] -; CHECK-NEXT: vmov.u16 r2, q5[4] +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vpsel q6, q2, q1 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[1] +; CHECK-NEXT: vmov.s8 r3, q0[0] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov q7[3], q7[1], r3, r2 +; CHECK-NEXT: vpsel q7, q7, q3 +; CHECK-NEXT: vmov lr, r12, d15 +; CHECK-NEXT: vmov r3, r2, d14 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r3, d13 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[3] +; CHECK-NEXT: vmov.s8 r3, q0[2] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vpsel q6, q6, q3 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[7] ; CHECK-NEXT: vmov.u16 r3, q5[5] ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vmov q6[2], q6[0], r4, r3 -; CHECK-NEXT: vand q6, q6, q1 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, r4, d10 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: vmov r3, r4, d11 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r4, r4, r12 +; CHECK-NEXT: vpsel q5, q2, q1 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[5] +; CHECK-NEXT: vmov.s8 r3, q0[4] 
+; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vpsel q6, q6, q3 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d11 ; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 ; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 -; CHECK-NEXT: vand q6, q6, q1 -; CHECK-NEXT: vand q5, q6, q5 +; CHECK-NEXT: vmov.s8 r2, q0[7] +; CHECK-NEXT: vmov.s8 r3, q0[6] +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vpsel q5, q5, q3 ; CHECK-NEXT: vmov r2, r3, d10 -; CHECK-NEXT: adds.w r12, lr, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, r2, d11 -; CHECK-NEXT: adds.w lr, r12, r4 -; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: adds.w lr, lr, r2 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -2202,133 +2447,8 @@ ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: adc.w r12, r12, r3 ; CHECK-NEXT: vcmp.i16 ne, q5, zr -; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.u16 r3, q2[0] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.u16 r3, q2[1] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 -; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, r4, d6 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: vmov r3, r4, d7 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r4, r4, r12 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 -; CHECK-NEXT: vand q4, q4, q1 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: adds.w r12, lr, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, r2, d7 -; CHECK-NEXT: adds.w lr, r12, r4 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 -; CHECK-NEXT: vmov.u8 
r3, q0[13] -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: vmov r3, r4, d5 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r4, r4, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, lr, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, r2, d1 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} -entry: - %c = icmp eq <16 x i8> %b, zeroinitializer - %xx = zext <16 x i8> %x to <16 x i64> - %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer - %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) - %r = add i64 %z, %a - ret i64 %r -} - -define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b, i64 %a) { -; CHECK-LABEL: add_v16i8_v16i64_acc_sext: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vcmp.i8 eq, q1, zr -; CHECK-NEXT: vmov.i8 q1, #0x0 -; CHECK-NEXT: vmov.i8 q2, #0xff -; CHECK-NEXT: vpsel q3, q2, q1 -; CHECK-NEXT: vmov.u8 r2, q3[0] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u8 r2, q3[1] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u8 r2, q3[2] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u8 r2, q3[3] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u8 r2, q3[4] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u8 r2, q3[5] -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.u8 r2, q3[6] -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.u8 r2, q3[7] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q4, q2, q1 ; CHECK-NEXT: vmov.u16 r2, q4[2] ; CHECK-NEXT: vmov.u16 r3, q4[0] @@ -2336,204 +2456,88 @@ ; CHECK-NEXT: vmov.u16 r2, q4[3] ; CHECK-NEXT: vmov.u16 r3, q4[1] ; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 -; CHECK-NEXT: vmov.s8 r2, q0[0] ; CHECK-NEXT: vcmp.i32 ne, q5, zr -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 -; CHECK-NEXT: and r3, lr, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r3, r12 -; CHECK-NEXT: vmov q5[3], q5[1], r3, r12 -; CHECK-NEXT: vmov.s8 r3, q0[1] +; CHECK-NEXT: vpsel q5, q2, q1 +; CHECK-NEXT: vmov r2, r3, d10 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r2, r12, d11 -; CHECK-NEXT: vmov r3, r4, d10 -; CHECK-NEXT: adds r5, r3, r2 -; CHECK-NEXT: ubfx r2, lr, #8, #1 -; CHECK-NEXT: adc.w r3, r4, r12 -; CHECK-NEXT: ubfx r4, lr, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r4 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r4 -; CHECK-NEXT: vmov.s8 r2, q0[3] -; 
CHECK-NEXT: vmov.s8 r4, q0[2] -; CHECK-NEXT: vmov q6[2], q6[0], r4, r2 +; CHECK-NEXT: vmov.s8 r2, q0[9] +; CHECK-NEXT: vmov.s8 r3, q0[8] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r2 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r2, r4, d10 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r5, r4, d11 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u16 r5, q4[6] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u16 r4, q4[4] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q4[7] -; CHECK-NEXT: vmov.u16 r4, q4[5] -; CHECK-NEXT: vmov q5[3], q5[1], r4, r5 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vpsel q6, q6, q3 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[11] +; CHECK-NEXT: vmov.s8 r3, q0[10] ; CHECK-NEXT: vcmp.i32 ne, q5, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: and r2, r5, #1 -; CHECK-NEXT: ubfx r4, r5, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r4 -; CHECK-NEXT: vmov.s8 r2, q0[5] -; CHECK-NEXT: vmov.s8 r4, q0[4] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r2 +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r2 -; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r2, r4, d8 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, r2, d9 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: ubfx r3, r5, #12, #1 -; CHECK-NEXT: ubfx r5, r5, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r3 -; CHECK-NEXT: vmov.s8 r3, q0[7] -; CHECK-NEXT: vmov.s8 r5, q0[6] -; CHECK-NEXT: vmov q5[2], q5[0], r5, r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r5, r3 -; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r3, r5, d8 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r5, r2 -; CHECK-NEXT: vmov r2, r4, d9 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: adc.w r3, r5, r4 -; CHECK-NEXT: vmov.u8 r5, q3[8] -; CHECK-NEXT: vmov.16 q4[0], r5 -; CHECK-NEXT: vmov.u8 r5, q3[9] -; CHECK-NEXT: vmov.16 q4[1], r5 -; CHECK-NEXT: vmov.u8 r5, q3[10] -; CHECK-NEXT: vmov.16 q4[2], r5 -; CHECK-NEXT: vmov.u8 r5, q3[11] -; CHECK-NEXT: vmov.16 q4[3], r5 -; CHECK-NEXT: vmov.u8 r5, q3[12] -; CHECK-NEXT: vmov.16 q4[4], r5 -; CHECK-NEXT: vmov.u8 r5, q3[13] -; CHECK-NEXT: vmov.16 q4[5], r5 -; CHECK-NEXT: vmov.u8 r5, q3[14] -; CHECK-NEXT: vmov.16 q4[6], r5 -; CHECK-NEXT: vmov.u8 r5, q3[15] -; CHECK-NEXT: vmov.16 q4[7], r5 -; CHECK-NEXT: vcmp.i16 ne, q4, zr +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vpsel q5, q5, q3 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u16 r2, q4[6] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u16 r3, q4[4] +; CHECK-NEXT: vmov q5[2], 
q5[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q4[7] +; CHECK-NEXT: vmov.u16 r3, q4[5] +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q5, zr ; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.u16 r5, q1[2] -; CHECK-NEXT: vmov.u16 r4, q1[0] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q1[3] -; CHECK-NEXT: vmov.u16 r4, q1[1] -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[13] +; CHECK-NEXT: vmov.s8 r3, q0[12] ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: and r2, r5, #1 -; CHECK-NEXT: ubfx r4, r5, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 -; CHECK-NEXT: vmov.s8 r2, q0[9] -; CHECK-NEXT: vmov.s8 r4, q0[8] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r2 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r2, r4, d4 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, r2, d5 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: ubfx r3, r5, #12, #1 -; CHECK-NEXT: ubfx r5, r5, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r5, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r3 -; CHECK-NEXT: vmov.s8 r3, q0[11] -; CHECK-NEXT: vmov.s8 r5, q0[10] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r5, r3 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, r5, d4 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r5, r2 -; CHECK-NEXT: vmov r2, r4, d5 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: adc.w r3, r5, r4 -; CHECK-NEXT: vmov.u16 r5, q1[6] -; CHECK-NEXT: vmov.u16 r4, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q1[7] -; CHECK-NEXT: vmov.u16 r4, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: and r2, r5, #1 -; CHECK-NEXT: ubfx r4, r5, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 -; CHECK-NEXT: vmov.s8 r2, q0[13] -; CHECK-NEXT: vmov.s8 r4, q0[12] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 +; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[15] +; CHECK-NEXT: vmov.s8 r3, q0[14] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r2 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r2, r4, d2 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, r2, d3 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: ubfx r3, r5, #12, #1 -; CHECK-NEXT: ubfx r5, r5, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; 
CHECK-NEXT: vmov q1[2], q1[0], r5, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-NEXT: vmov.s8 r3, q0[15] -; CHECK-NEXT: vmov.s8 r5, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, r5, d0 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r2, r5 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vpsel q0, q0, q3 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> @@ -2546,23 +2550,34 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xff +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, r12, d0 -; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: orr.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: bx lr +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer %xx = zext <2 x i8> %x to <2 x i64> @@ -2578,16 +2593,23 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q2, #0xff +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 @@ -2595,7 +2617,7 @@ ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r3, 
r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -2617,15 +2639,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, r2, d2 -; CHECK-NEXT: csetm r12, eq +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r12, r2, #0, #8 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: vmov q1[2], q1[0], r2, r12 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r12 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r12, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r12 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -50,16 +50,23 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) { ; CHECK-LABEL: add_v2i32_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmullb.u32 q3, q0, q1 -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q3, q0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q3, q0 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -78,16 +85,23 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) { ; CHECK-LABEL: add_v2i32_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmullb.s32 q3, q0, q1 -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q3, q0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q3, q0 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -356,16 +370,23 @@ ; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: umull r2, r3, r3, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r1, 
s4 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -385,28 +406,35 @@ ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q3, #0xffff -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: sxth r1, r1 ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -801,325 +829,313 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0xff -; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q5[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q5[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q5[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q5[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q5[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q5[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q5[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vmov.u8 r3, q4[0] -; CHECK-NEXT: vpsel q6, q3, q0 -; CHECK-NEXT: vmov.u16 r0, q6[2] -; CHECK-NEXT: vmov.u16 r1, 
q6[0] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov.u16 r1, q6[1] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vpsel q6, q2, q0 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.u8 r0, q6[0] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q6[1] +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q6[2] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q6[3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q6[4] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q6[5] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q6[6] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q6[7] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vmov.u8 r2, q3[0] +; CHECK-NEXT: vpsel q7, q2, q4 +; CHECK-NEXT: vmov.u16 r0, q7[2] +; CHECK-NEXT: vmov.u16 r1, q7[0] +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q7[3] +; CHECK-NEXT: vmov.u16 r1, q7[1] +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q2, q4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.u8 r1, q1[0] ; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 +; CHECK-NEXT: vmov.u8 r1, q3[1] ; CHECK-NEXT: vmov.i64 q2, #0xff -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q7[2], q7[0], r2, r1 -; CHECK-NEXT: vmov q7[3], q7[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov.u8 r2, q1[0] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov.u8 r2, q4[1] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r1, r12, r2, r1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: umull r2, r3, r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov r1, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w lr, r3, r1 -; CHECK-NEXT: ubfx r3, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: vmov.u8 r1, q4[2] -; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov q7[3], q7[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 +; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 +; CHECK-NEXT: vmov q4[3], q4[1], r3, r1 +; CHECK-NEXT: vpsel q4, q4, q5 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov.u8 r0, q3[2] +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.u8 r3, q1[2] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 
-; CHECK-NEXT: vmov.u8 r3, q4[3] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r3 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q3[3] +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov q7, q4 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vpsel q0, q0, q5 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r0, r0, lr -; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov.u16 r2, q6[6] -; CHECK-NEXT: vmov.u16 r3, q6[4] -; CHECK-NEXT: vmov.u8 r1, q4[4] +; CHECK-NEXT: vmov.u16 r2, q7[6] +; CHECK-NEXT: vmov.u16 r3, q7[4] +; CHECK-NEXT: vmov.u8 r0, q3[4] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q6[7] -; CHECK-NEXT: vmov.u16 r3, q6[5] +; CHECK-NEXT: vmov.u16 r2, q7[7] +; CHECK-NEXT: vmov.u16 r3, q7[5] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q1[5] ; CHECK-NEXT: vmov.u8 r3, q1[4] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov.u8 r3, q4[5] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r3 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r1 -; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r3, r4, d1 +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q3[5] +; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vand q7, q7, q2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov r1, s28 +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 +; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 +; CHECK-NEXT: vpsel q4, q4, q5 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: adds.w r0, r0, r12 ; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w 
r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 -; CHECK-NEXT: adcs r1, r4 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov.u8 r0, q3[6] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[7] ; CHECK-NEXT: vmov.u8 r3, q1[6] -; CHECK-NEXT: vmov.u8 r4, q4[6] +; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q4[7] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 +; CHECK-NEXT: vmov.u8 r3, q3[7] +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r0, r2, d0 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u8 r2, q5[8] -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vmov.u8 r2, q5[9] -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov.u8 r2, q5[10] -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.u8 r2, q5[11] -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov.u8 r2, q5[12] -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.u8 r2, q5[13] -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov.u8 r2, q5[14] -; CHECK-NEXT: vmov.16 q6[6], r2 -; CHECK-NEXT: vmov.u8 r2, q5[15] -; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmov.u8 r2, q6[8] ; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vmov.u8 r0, q7[8] -; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.u16 r3, q3[0] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q6[9] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov.u8 r2, q6[10] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.u8 r2, q6[11] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.u8 r2, q6[12] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.u8 r2, q6[13] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.u8 r2, q6[14] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.u8 r2, q6[15] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov.u8 r0, q3[8] +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q6, q7, q4 +; CHECK-NEXT: vmov.u16 r2, q6[2] +; CHECK-NEXT: vmov.u16 r3, q6[0] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vmov.u16 r3, q3[1] +; CHECK-NEXT: vmov.u16 r2, q6[3] +; CHECK-NEXT: vmov.u16 r3, q6[1] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, 
#0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[9] -; CHECK-NEXT: vmov.u8 r4, q1[8] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q7[9] -; CHECK-NEXT: vmov q5[2], q5[0], r0, r4 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q5, q5, q2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r1, s20 -; CHECK-NEXT: umull r0, r3, r0, r3 -; CHECK-NEXT: umull r1, r4, r1, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r3, r4, d1 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: vpsel q0, q7, q4 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 -; CHECK-NEXT: adcs r1, r4 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: vmov.u8 r3, q1[8] +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q3[9] +; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vand q7, q7, q2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r1, s28 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 +; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 +; CHECK-NEXT: vpsel q4, q4, q5 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov.u8 r0, q3[10] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[11] ; CHECK-NEXT: vmov.u8 r3, q1[10] -; CHECK-NEXT: vmov.u8 r4, q7[10] +; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q7[11] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r3 +; CHECK-NEXT: vmov.u8 r3, q3[11] +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r0, r2, d0 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u16 r2, q3[6] ; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov.u16 r3, q3[4] +; CHECK-NEXT: vmov.u16 r2, q6[6] +; CHECK-NEXT: vmov.u16 r3, q6[4] +; CHECK-NEXT: vmov.u8 r0, q3[12] ; CHECK-NEXT: vmov q0[2], q0[0], 
r3, r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.u16 r3, q3[5] -; CHECK-NEXT: vmov.u8 r0, q7[12] +; CHECK-NEXT: vmov.u16 r2, q6[7] +; CHECK-NEXT: vmov.u16 r3, q6[5] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[13] -; CHECK-NEXT: vmov.u8 r4, q1[12] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q7[13] -; CHECK-NEXT: vmov q4[2], q4[0], r0, r4 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: vmov.u8 r3, q1[12] +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q3[13] +; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: umull r0, r3, r0, r3 -; CHECK-NEXT: umull r1, r4, r1, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r3, r4, d1 +; CHECK-NEXT: vand q6, q6, q2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 +; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 +; CHECK-NEXT: vpsel q4, q4, q5 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: adds.w r0, r0, r12 ; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: adcs r1, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov.u8 r0, q3[14] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[15] ; CHECK-NEXT: vmov.u8 r3, q1[14] -; CHECK-NEXT: vmov.u8 r4, q7[14] +; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q7[15] -; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 +; CHECK-NEXT: vmov.u8 r3, q3[15] +; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r0, r2, d0 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 +; 
CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -1133,250 +1149,241 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vmov.s8 r3, q1[0] -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov.s8 r4, q0[4] -; CHECK-NEXT: vmov.u8 r0, q4[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q4[1] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q4[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q4[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q4[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q4[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q4[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q4[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vcmp.i16 ne, q5, zr -; CHECK-NEXT: vpsel q5, q3, q2 -; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vmov.u16 r1, q5[0] -; CHECK-NEXT: vmov q6[2], q6[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.u16 r1, q5[1] -; CHECK-NEXT: vmov q6[3], q6[1], r1, r0 -; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: ubfx r1, r0, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r1 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r1 -; CHECK-NEXT: vmov.s8 r1, q1[1] -; CHECK-NEXT: vmov.s8 r2, q0[1] -; CHECK-NEXT: smull r1, r12, r2, r1 -; CHECK-NEXT: vmov.s8 r2, q0[0] -; CHECK-NEXT: smull r2, r3, r2, r3 -; CHECK-NEXT: vmov q7[2], q7[0], r2, r1 -; CHECK-NEXT: vmov q7[3], q7[1], r3, r12 -; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r1, r12, d13 -; CHECK-NEXT: vmov r3, r2, d12 -; CHECK-NEXT: adds.w lr, r3, r1 -; CHECK-NEXT: ubfx r3, r0, #12, #1 -; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 -; CHECK-NEXT: vmov.s8 r1, q1[2] -; CHECK-NEXT: vmov q6[3], q6[1], r0, r3 -; CHECK-NEXT: vmov.s8 r2, q0[2] -; CHECK-NEXT: vmov.s8 r0, q1[3] -; CHECK-NEXT: vmov.s8 r3, q0[3] -; CHECK-NEXT: smull r0, r3, r3, r0 -; CHECK-NEXT: smull r1, r2, r2, r1 -; CHECK-NEXT: vmov q7[2], q7[0], r1, r0 -; CHECK-NEXT: vmov q7[3], q7[1], r2, r3 -; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r0, r1, d12 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: adds.w r0, r0, lr -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: vmov.s8 r1, q1[4] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q5[7] -; CHECK-NEXT: vmov.u16 r3, q5[5] -; CHECK-NEXT: smull r1, r4, r4, r1 -; CHECK-NEXT: vmov q6[3], q6[1], 
r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r0, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r0, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r0, r3 -; CHECK-NEXT: vmov.s8 r0, q1[5] -; CHECK-NEXT: vmov.s8 r3, q0[5] -; CHECK-NEXT: smull r0, r3, r3, r0 -; CHECK-NEXT: vmov q6[2], q6[0], r1, r0 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r3 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r0, r1, d10 -; CHECK-NEXT: vmov r3, r4, d11 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 -; CHECK-NEXT: adcs r1, r4 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 -; CHECK-NEXT: vmov.s8 r2, q1[7] -; CHECK-NEXT: vmov.s8 r3, q0[7] -; CHECK-NEXT: vmov.s8 r4, q1[6] -; CHECK-NEXT: vmov.s8 r0, q0[6] +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vpsel q5, q2, q0 +; CHECK-NEXT: vmov.s8 r2, q1[0] +; CHECK-NEXT: vmov.u8 r0, q5[0] +; CHECK-NEXT: vmov.s8 r3, q3[0] +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.u8 r0, q5[1] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u8 r0, q5[2] +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.u8 r0, q5[3] +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov.u8 r0, q5[4] +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.u8 r0, q5[5] +; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vmov.u8 r0, q5[6] +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.u8 r0, q5[7] +; CHECK-NEXT: vmov.16 q4[7], r0 ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r4, r0, r4 -; CHECK-NEXT: vmov q6[2], q6[0], r0, r2 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r3 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r0, r2, d10 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, r3, d11 -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u8 r2, q4[8] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u8 r2, q4[9] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u8 r2, q4[10] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u8 r2, q4[11] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u8 r2, q4[12] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u8 r2, q4[13] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u8 r2, q4[14] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u8 r2, q4[15] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vcmp.i16 ne, q5, zr -; CHECK-NEXT: vmov.s8 r0, q1[8] -; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.s8 r1, q0[8] -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.u16 r3, q2[0] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.u16 r3, q2[1] +; CHECK-NEXT: vcmp.i16 ne, q4, zr +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vpsel q6, q2, q0 +; CHECK-NEXT: vmov.u16 r0, q6[2] +; CHECK-NEXT: vmov.u16 r1, q6[0] +; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q6[3] +; CHECK-NEXT: vmov.u16 r1, q6[1] +; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vpsel q7, q2, q0 +; CHECK-NEXT: vmov r0, r1, d14 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r1 +; CHECK-NEXT: vmov q4[3], q4[1], r0, r1 +; CHECK-NEXT: vmov.s8 r0, q1[1] +; CHECK-NEXT: vmov.s8 r1, q3[1] +; CHECK-NEXT: vcmp.i32 ne, 
q4, zr ; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.s8 r3, q1[9] -; CHECK-NEXT: vmov.s8 r4, q0[9] -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r4 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov r3, r4, d7 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: adcs r1, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 -; CHECK-NEXT: vmov.s8 r2, q1[11] -; CHECK-NEXT: vmov.s8 r3, q0[11] -; CHECK-NEXT: vmov.s8 r4, q1[10] -; CHECK-NEXT: vmov.s8 r0, q0[10] +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov.s8 r0, q1[2] +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: vmov r2, r3, d15 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov.s8 r1, q3[2] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q1[3] +; CHECK-NEXT: vmov.s8 r3, q3[3] +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r4, r0, r4 -; CHECK-NEXT: vmov q4[2], q4[0], r0, r2 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r3 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r0, r2, d6 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov.u16 r2, q6[6] +; CHECK-NEXT: vmov.u16 r3, q6[4] +; CHECK-NEXT: vmov.s8 r0, q1[4] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q6[7] +; CHECK-NEXT: vmov.u16 r3, q6[5] +; CHECK-NEXT: vmov.s8 r1, q3[4] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q1[5] +; CHECK-NEXT: vmov.s8 r3, q3[5] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov.s8 r0, q1[6] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.s8 r1, q3[6] +; CHECK-NEXT: vmov.s8 r2, q1[7] +; CHECK-NEXT: vmov.s8 r3, q3[7] +; CHECK-NEXT: smull r2, r3, 
r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q5[8] +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov.u8 r2, q5[9] +; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov.u8 r2, q5[10] +; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.u8 r2, q5[11] +; CHECK-NEXT: vmov.16 q6[3], r2 +; CHECK-NEXT: vmov.u8 r2, q5[12] +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.u8 r2, q5[13] +; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov.u8 r2, q5[14] +; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.u8 r2, q5[15] +; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmov.s8 r0, q1[8] +; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.s8 r1, q3[8] +; CHECK-NEXT: vpsel q5, q2, q7 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.u16 r2, q5[2] +; CHECK-NEXT: vmov.u16 r3, q5[0] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q1[9] +; CHECK-NEXT: vmov.s8 r3, q3[9] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] ; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.u16 r3, q2[5] +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov.s8 r0, q1[10] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.s8 r1, q3[10] +; CHECK-NEXT: vmov.s8 r2, q1[11] +; CHECK-NEXT: vmov.s8 r3, q3[11] +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: vmov.u16 r3, q5[4] ; CHECK-NEXT: vmov.s8 r0, q1[12] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov.s8 r1, q0[12] -; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov.s8 r1, q3[12] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 -; CHECK-NEXT: vmov.s8 r3, q1[13] -; CHECK-NEXT: vmov.s8 r4, q0[13] -; CHECK-NEXT: smull r3, r4, r4, r3 -; 
CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r4 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r3, r4, d5 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q2, q2, q7 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.s8 r2, q1[13] +; CHECK-NEXT: vmov.s8 r3, q3[13] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: adds.w r0, r0, r12 ; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: adcs r1, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov.s8 r0, q1[14] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov.s8 r1, q3[14] ; CHECK-NEXT: vmov.s8 r2, q1[15] -; CHECK-NEXT: vmov.s8 r3, q0[15] -; CHECK-NEXT: vmov.s8 r4, q1[14] -; CHECK-NEXT: vmov.s8 r0, q0[14] +; CHECK-NEXT: vmov.s8 r3, q3[15] ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r4, r0, r4 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, r2, d0 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> @@ -1565,16 +1572,23 @@ ; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: umull r2, r3, r3, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -1594,28 +1608,35 @@ ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q3, #0xff -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov 
r1, s8 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, eq +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -1636,27 +1657,34 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: vmov r2, lr, d1 ; CHECK-NEXT: vmov r4, r9, d2 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r2, d4 -; CHECK-NEXT: csetm r0, eq -; CHECK-NEXT: umull r3, r5, r6, r4 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, lr, d1 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, r12, d3 ; CHECK-NEXT: umull r1, r8, r2, r0 +; CHECK-NEXT: umull r3, r5, r6, r4 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 ; CHECK-NEXT: mla r1, r2, r12, r8 ; CHECK-NEXT: mla r0, lr, r0, r1 ; CHECK-NEXT: mla r1, r6, r9, r5 ; CHECK-NEXT: mla r1, r7, r4, r1 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d5 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 @@ -1724,16 +1752,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vmullb.u32 q3, q0, q1 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q3, q0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q3, q0 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -1757,16 +1792,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; 
CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vmullb.s32 q3, q0, q1 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q3, q0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q3, q0 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -1978,16 +2020,23 @@ ; CHECK-NEXT: vand q1, q2, q3 ; CHECK-NEXT: umull r2, r3, r2, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -2012,28 +2061,35 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q3, #0xffff +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: smull lr, r12, r3, r2 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: smull r2, r3, r2, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -2280,327 +2336,315 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; 
CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0xff -; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov.u8 r2, q5[0] -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.u8 r2, q5[1] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u8 r2, q5[2] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u8 r2, q5[3] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u8 r2, q5[4] -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.u8 r2, q5[5] -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vmov.u8 r2, q5[6] -; CHECK-NEXT: vmov.16 q2[6], r2 -; CHECK-NEXT: vmov.u8 r2, q5[7] -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vpsel q6, q3, q0 -; CHECK-NEXT: vmov.u16 r2, q6[2] -; CHECK-NEXT: vmov.u16 r3, q6[0] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q6[3] -; CHECK-NEXT: vmov.u16 r3, q6[1] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov.i64 q2, #0xff -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r2, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q7[2], q7[0], r2, r3 -; CHECK-NEXT: vmov q7[3], q7[1], r2, r3 +; CHECK-NEXT: vpsel q6, q2, q0 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.u8 r2, q6[0] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q6[1] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov.u8 r2, q6[2] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.u8 r2, q6[3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.u8 r2, q6[4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.u8 r2, q6[5] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.u8 r2, q6[6] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.u8 r2, q6[7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vmov.u8 r4, q3[2] +; CHECK-NEXT: vpsel q7, q2, q4 +; CHECK-NEXT: vmov.u16 r2, q7[2] +; CHECK-NEXT: vmov.u16 r3, q7[0] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q7[3] +; CHECK-NEXT: vmov.u16 r3, q7[1] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q2, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[1] ; CHECK-NEXT: vmov.u8 r3, q1[0] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q4[1] -; CHECK-NEXT: vmov.u8 r2, q4[0] -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: umull r2, r12, r2, r12 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r12 -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov r2, r12, d1 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: ubfx r2, lr, #12, #1 -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: ubfx r4, lr, #8, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov 
q7[2], q7[0], r4, r2 -; CHECK-NEXT: vmov.u8 r3, q4[2] -; CHECK-NEXT: vmov q7[3], q7[1], r4, r2 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q3[1] +; CHECK-NEXT: vmov.u8 r2, q3[0] +; CHECK-NEXT: vmov.i64 q2, #0xff +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r12, s22 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: umull lr, r12, r2, r12 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: umull r2, r3, r2, r3 +; CHECK-NEXT: vmov q4[2], q4[0], r2, lr +; CHECK-NEXT: vmov q4[3], q4[1], r3, r12 +; CHECK-NEXT: vpsel q4, q4, q5 +; CHECK-NEXT: vmov lr, r12, d9 +; CHECK-NEXT: vmov r3, r2, d8 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[3] -; CHECK-NEXT: vmov.u8 r4, q1[2] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-NEXT: vmov.u8 r4, q4[3] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r4 +; CHECK-NEXT: vmov.u8 r3, q1[2] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q3[3] +; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r5, s12 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r5, r4, r5, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q4[4] -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov q7, q4 +; CHECK-NEXT: umull r4, r5, r5, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vpsel q0, q0, q5 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds.w r2, r2, lr ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds.w r12, r2, r6 -; CHECK-NEXT: vmov.u16 r2, q6[6] -; CHECK-NEXT: vmov.u16 r6, q6[4] -; CHECK-NEXT: adc.w lr, r3, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov.u16 r2, q6[7] -; CHECK-NEXT: vmov.u16 r6, q6[5] -; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u16 r5, q7[6] +; CHECK-NEXT: vmov.u16 r4, q7[4] +; CHECK-NEXT: vmov.u8 r2, q3[4] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q7[7] +; CHECK-NEXT: vmov.u16 r4, q7[5] +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r5, r2, #1 -; CHECK-NEXT: ubfx r6, r2, #4, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r5, r6 -; CHECK-NEXT: vmov q6[3], q6[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[5] -; CHECK-NEXT: vmov.u8 r5, q1[4] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q4[5] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r5, r4, d0 +; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 +; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q1[5] +; CHECK-NEXT: vmov.u8 r4, q1[4] +; CHECK-NEXT: vcmp.i32 ne, q4, 
zr +; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q3[5] +; CHECK-NEXT: vmov q7[2], q7[0], r2, r4 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vand q7, q7, q2 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vmov r3, s28 +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 +; CHECK-NEXT: vpsel q4, q4, q5 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov.u8 r2, q3[6] +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q1[7] +; CHECK-NEXT: vmov.u8 r4, q1[6] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q3[7] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r6, lr, r4 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q4[6] -; CHECK-NEXT: adc.w r12, r6, r4 -; CHECK-NEXT: ubfx r6, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r6 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r6 -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.u8 r6, q1[6] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov.u8 r6, q4[7] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vmov.u8 r4, q7[8] -; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r6 -; CHECK-NEXT: vmov r6, r5, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: adds.w r12, r2, r6 -; CHECK-NEXT: vmov.u8 r2, q5[8] -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vmov.u8 r2, q5[9] -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov.u8 r2, q5[10] -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.u8 r2, q5[11] -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov.u8 r2, q5[12] -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.u8 r2, q5[13] -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov.u8 r2, q5[14] -; CHECK-NEXT: vmov.16 q6[6], r2 -; CHECK-NEXT: vmov.u8 r2, q5[15] -; CHECK-NEXT: vmov.16 
q6[7], r2 -; CHECK-NEXT: adc.w lr, r3, r5 -; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.u16 r6, q3[0] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vmov.u16 r6, q3[1] -; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q6[8] +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.16 q0[0], r5 +; CHECK-NEXT: vmov.u8 r5, q6[9] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.u8 r5, q6[10] +; CHECK-NEXT: vmov.16 q0[2], r5 +; CHECK-NEXT: vmov.u8 r5, q6[11] +; CHECK-NEXT: vmov.16 q0[3], r5 +; CHECK-NEXT: vmov.u8 r5, q6[12] +; CHECK-NEXT: vmov.16 q0[4], r5 +; CHECK-NEXT: vmov.u8 r5, q6[13] +; CHECK-NEXT: vmov.16 q0[5], r5 +; CHECK-NEXT: vmov.u8 r5, q6[14] +; CHECK-NEXT: vmov.16 q0[6], r5 +; CHECK-NEXT: vmov.u8 r5, q6[15] +; CHECK-NEXT: vmov.16 q0[7], r5 +; CHECK-NEXT: vmov.u8 r2, q3[8] +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q6, q7, q4 +; CHECK-NEXT: vmov.u16 r5, q6[2] +; CHECK-NEXT: vmov.u16 r4, q6[0] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q6[3] +; CHECK-NEXT: vmov.u16 r4, q6[1] +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q7, q4 +; CHECK-NEXT: vmov r5, r4, d0 +; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 +; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q1[9] +; CHECK-NEXT: vmov.u8 r4, q1[8] +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q3[9] +; CHECK-NEXT: vmov q7[2], q7[0], r2, r4 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vand q7, q7, q2 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r3, s28 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 +; CHECK-NEXT: vpsel q4, q4, q5 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov.u8 r2, q3[10] +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q1[11] +; CHECK-NEXT: vmov.u8 r4, q1[10] ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r5, r2, #1 -; CHECK-NEXT: ubfx r6, r2, #4, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r6 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[9] -; CHECK-NEXT: vmov.u8 r5, q1[8] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q7[9] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q3[11] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q5, q5, q2 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov r5, s22 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r6 +; CHECK-NEXT: vmov q0[2], 
q0[0], r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r6, lr, r4 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q7[10] -; CHECK-NEXT: adc.w r12, r6, r4 -; CHECK-NEXT: ubfx r6, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r6 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r6 -; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: vmov.u8 r6, q1[10] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov.u8 r6, q7[11] -; CHECK-NEXT: vmov q5[2], q5[0], r5, r6 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q5, q5, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r6, s22 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vmov.u8 r4, q7[12] -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r6 -; CHECK-NEXT: vmov r6, r5, d1 -; CHECK-NEXT: adds.w r12, r2, r6 -; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.u16 r6, q3[4] -; CHECK-NEXT: adc.w lr, r3, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.u16 r6, q3[5] -; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u16 r5, q6[6] +; CHECK-NEXT: vmov.u16 r4, q6[4] +; CHECK-NEXT: vmov.u8 r2, q3[12] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q6[7] +; CHECK-NEXT: vmov.u16 r4, q6[5] +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r5, r2, #1 -; CHECK-NEXT: ubfx r6, r2, #4, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[13] -; CHECK-NEXT: vmov.u8 r5, q1[12] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q7[13] +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r5, r4, d0 +; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 +; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q1[13] +; CHECK-NEXT: vmov.u8 r4, q1[12] +; CHECK-NEXT: vcmp.i32 ne, q4, zr ; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov.u8 r4, q3[13] +; CHECK-NEXT: vmov q6[2], q6[0], r2, r4 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vand q6, q6, q2 ; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r6, lr, r4 +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 +; CHECK-NEXT: vpsel q4, q4, q5 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: 
vmov r5, r4, d9 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 ; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q7[14] -; CHECK-NEXT: adc.w r12, r6, r4 -; CHECK-NEXT: ubfx r6, r2, #12, #1 -; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r6 -; CHECK-NEXT: vmov.u8 r2, q1[15] -; CHECK-NEXT: vmov.u8 r6, q1[14] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov.u8 r6, q7[15] -; CHECK-NEXT: vmov q1[2], q1[0], r5, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov.u8 r2, q3[14] +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q1[15] +; CHECK-NEXT: vmov.u8 r4, q1[14] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q3[15] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r6, s6 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r6 -; CHECK-NEXT: vmov r6, r5, d1 -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vpsel q0, q0, q5 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -2615,252 +2659,243 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vmov.s8 r4, q0[0] -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov.s8 r5, q0[2] -; CHECK-NEXT: vmov.u8 r2, q4[0] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u8 r2, q4[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u8 r2, q4[2] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u8 r2, q4[3] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u8 r2, q4[4] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u8 r2, q4[5] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u8 r2, q4[6] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: 
vmov.u8 r2, q4[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vcmp.i16 ne, q5, zr -; CHECK-NEXT: vpsel q5, q3, q2 -; CHECK-NEXT: vmov.u16 r2, q5[2] -; CHECK-NEXT: vmov.u16 r3, q5[0] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q5[3] -; CHECK-NEXT: vmov.u16 r3, q5[1] -; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmrs r12, p0 -; CHECK-NEXT: and r2, r12, #1 -; CHECK-NEXT: ubfx r3, r12, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vpsel q5, q2, q0 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.u8 r2, q5[0] +; CHECK-NEXT: vmov.s8 r4, q1[2] +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q5[1] +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.u8 r2, q5[2] +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.u8 r2, q5[3] +; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov.u8 r2, q5[4] +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vmov.u8 r2, q5[5] +; CHECK-NEXT: vmov.16 q4[5], r2 +; CHECK-NEXT: vmov.u8 r2, q5[6] +; CHECK-NEXT: vmov.16 q4[6], r2 +; CHECK-NEXT: vmov.u8 r2, q5[7] +; CHECK-NEXT: vmov.16 q4[7], r2 +; CHECK-NEXT: vmov.s8 r5, q3[2] +; CHECK-NEXT: vcmp.i16 ne, q4, zr +; CHECK-NEXT: smull r4, r5, r5, r4 +; CHECK-NEXT: vpsel q6, q2, q0 +; CHECK-NEXT: vmov.u16 r2, q6[2] +; CHECK-NEXT: vmov.u16 r3, q6[0] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q6[3] +; CHECK-NEXT: vmov.u16 r3, q6[1] +; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vpsel q7, q2, q0 +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[1] -; CHECK-NEXT: vmov.s8 r3, q0[1] -; CHECK-NEXT: smull r2, lr, r3, r2 +; CHECK-NEXT: vmov.s8 r3, q3[1] +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: smull lr, r12, r3, r2 ; CHECK-NEXT: vmov.s8 r3, q1[0] -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 -; CHECK-NEXT: vmov q7[3], q7[1], r4, lr -; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r2, lr, d13 -; CHECK-NEXT: vmov r4, r3, d12 -; CHECK-NEXT: adds r6, r4, r2 -; CHECK-NEXT: ubfx r4, r12, #12, #1 -; CHECK-NEXT: ubfx r2, r12, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r4 -; CHECK-NEXT: vmov.s8 r3, q1[2] -; CHECK-NEXT: vmov q6[3], q6[1], r2, r4 +; CHECK-NEXT: vmov.s8 r2, q3[0] +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: smull r2, r3, r2, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r3, d15 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[3] -; CHECK-NEXT: vmov.s8 r4, q0[3] -; CHECK-NEXT: smull r3, r5, r5, r3 -; CHECK-NEXT: smull r2, r4, r4, r2 -; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 -; CHECK-NEXT: vmov q7[3], q7[1], r5, r4 -; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: vmov r6, r5, d13 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r6 -; 
CHECK-NEXT: vmov.u16 r6, q5[6] -; CHECK-NEXT: adc.w lr, r3, r5 -; CHECK-NEXT: vmov.u16 r5, q5[4] -; CHECK-NEXT: vmov q6[2], q6[0], r5, r6 -; CHECK-NEXT: vmov.u16 r6, q5[7] -; CHECK-NEXT: vmov.u16 r5, q5[5] +; CHECK-NEXT: vmov.s8 r3, q3[3] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u16 r5, q6[6] +; CHECK-NEXT: vmov.u16 r4, q6[4] ; CHECK-NEXT: vmov.s8 r2, q1[4] -; CHECK-NEXT: vmov q6[3], q6[1], r5, r6 -; CHECK-NEXT: vmov.s8 r3, q0[4] -; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q6[7] +; CHECK-NEXT: vmov.u16 r4, q6[5] +; CHECK-NEXT: vmov.s8 r3, q3[4] +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r4, r6, #1 -; CHECK-NEXT: ubfx r5, r6, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r5 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov r5, r4, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 ; CHECK-NEXT: vmov.s8 r5, q1[5] -; CHECK-NEXT: vmov.s8 r4, q0[5] +; CHECK-NEXT: vmov.s8 r4, q3[5] +; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r5 -; CHECK-NEXT: vmov q6[3], q6[1], r3, r4 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r2, r3, d10 -; CHECK-NEXT: vmov r5, r4, d11 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, r4, d1 ; CHECK-NEXT: adds.w r2, r2, r12 ; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: ubfx r5, r6, #12, #1 -; CHECK-NEXT: ubfx r6, r6, #8, #1 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r6, r5 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov q5[3], q5[1], r6, r5 -; CHECK-NEXT: vmov.s8 r6, q1[7] -; CHECK-NEXT: vmov.s8 r5, q0[7] -; CHECK-NEXT: vmov.s8 r4, q1[6] -; CHECK-NEXT: vmov.s8 r2, q0[6] -; CHECK-NEXT: smull r6, r5, r5, r6 -; CHECK-NEXT: smull r2, r4, r2, r4 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r6 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r5 -; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r2, r6, d10 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov.s8 r2, q1[6] +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: vmov.s8 r3, q3[6] +; CHECK-NEXT: vmov.s8 r5, q1[7] +; CHECK-NEXT: vmov.s8 r4, q3[7] +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, r4, d1 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r6 -; CHECK-NEXT: vmov r6, r5, d11 -; CHECK-NEXT: adds.w r12, r2, r6 -; CHECK-NEXT: vmov.u8 r6, q4[8] -; CHECK-NEXT: vmov.16 q5[0], r6 -; CHECK-NEXT: vmov.u8 r6, q4[9] -; CHECK-NEXT: vmov.16 q5[1], r6 -; CHECK-NEXT: vmov.u8 r6, q4[10] -; CHECK-NEXT: vmov.16 
q5[2], r6 -; CHECK-NEXT: vmov.u8 r6, q4[11] -; CHECK-NEXT: vmov.16 q5[3], r6 -; CHECK-NEXT: vmov.u8 r6, q4[12] -; CHECK-NEXT: vmov.16 q5[4], r6 -; CHECK-NEXT: vmov.u8 r6, q4[13] -; CHECK-NEXT: vmov.16 q5[5], r6 -; CHECK-NEXT: vmov.u8 r6, q4[14] -; CHECK-NEXT: vmov.16 q5[6], r6 -; CHECK-NEXT: vmov.u8 r6, q4[15] -; CHECK-NEXT: vmov.16 q5[7], r6 -; CHECK-NEXT: adc.w lr, r3, r5 -; CHECK-NEXT: vcmp.i16 ne, q5, zr +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q5[8] +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.16 q6[0], r5 +; CHECK-NEXT: vmov.u8 r5, q5[9] +; CHECK-NEXT: vmov.16 q6[1], r5 +; CHECK-NEXT: vmov.u8 r5, q5[10] +; CHECK-NEXT: vmov.16 q6[2], r5 +; CHECK-NEXT: vmov.u8 r5, q5[11] +; CHECK-NEXT: vmov.16 q6[3], r5 +; CHECK-NEXT: vmov.u8 r5, q5[12] +; CHECK-NEXT: vmov.16 q6[4], r5 +; CHECK-NEXT: vmov.u8 r5, q5[13] +; CHECK-NEXT: vmov.16 q6[5], r5 +; CHECK-NEXT: vmov.u8 r5, q5[14] +; CHECK-NEXT: vmov.16 q6[6], r5 +; CHECK-NEXT: vmov.u8 r5, q5[15] +; CHECK-NEXT: vmov.16 q6[7], r5 ; CHECK-NEXT: vmov.s8 r2, q1[8] -; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.s8 r3, q0[8] -; CHECK-NEXT: vmov.u16 r6, q2[2] -; CHECK-NEXT: vmov.u16 r5, q2[0] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u16 r6, q2[3] -; CHECK-NEXT: vmov.u16 r5, q2[1] +; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.s8 r3, q3[8] +; CHECK-NEXT: vpsel q5, q2, q7 ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r4, r6, #1 -; CHECK-NEXT: ubfx r5, r6, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q5[2] +; CHECK-NEXT: vmov.u16 r4, q5[0] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q5[3] +; CHECK-NEXT: vmov.u16 r4, q5[1] +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov r5, r4, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 ; CHECK-NEXT: vmov.s8 r5, q1[9] -; CHECK-NEXT: vmov.s8 r4, q0[9] +; CHECK-NEXT: vmov.s8 r4, q3[9] +; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r5 -; CHECK-NEXT: vmov q4[3], q4[1], r3, r4 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, r4, d1 ; CHECK-NEXT: adds.w r2, r2, r12 ; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: ubfx r5, r6, #12, #1 -; CHECK-NEXT: ubfx r6, r6, #8, #1 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r6, r5 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r6, r5 -; CHECK-NEXT: vmov.s8 r6, q1[11] -; CHECK-NEXT: vmov.s8 r5, q0[11] -; CHECK-NEXT: vmov.s8 r4, q1[10] -; CHECK-NEXT: vmov.s8 r2, q0[10] -; CHECK-NEXT: smull r6, r5, r5, r6 -; CHECK-NEXT: smull r2, r4, r2, r4 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r6 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 -; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r2, r6, d6 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov.s8 r2, q1[10] +; CHECK-NEXT: vmov q0[3], q0[1], r5, 
r4 +; CHECK-NEXT: vmov.s8 r3, q3[10] +; CHECK-NEXT: vmov.s8 r5, q1[11] +; CHECK-NEXT: vmov.s8 r4, q3[11] +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, r4, d1 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r6 -; CHECK-NEXT: vmov r6, r5, d7 -; CHECK-NEXT: adds.w r12, r2, r6 -; CHECK-NEXT: vmov.u16 r6, q2[6] -; CHECK-NEXT: adc.w lr, r3, r5 -; CHECK-NEXT: vmov.u16 r5, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u16 r6, q2[7] -; CHECK-NEXT: vmov.u16 r5, q2[5] +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u16 r5, q5[6] +; CHECK-NEXT: vmov.u16 r4, q5[4] ; CHECK-NEXT: vmov.s8 r2, q1[12] -; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 -; CHECK-NEXT: vmov.s8 r3, q0[12] -; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q5[7] +; CHECK-NEXT: vmov.u16 r4, q5[5] +; CHECK-NEXT: vmov.s8 r3, q3[12] +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r4, r6, #1 -; CHECK-NEXT: ubfx r5, r6, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q2, q2, q7 +; CHECK-NEXT: vmov r5, r4, d4 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 ; CHECK-NEXT: vmov.s8 r5, q1[13] -; CHECK-NEXT: vmov.s8 r4, q0[13] +; CHECK-NEXT: vmov.s8 r4, q3[13] +; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r5 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r4 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, r4, d1 ; CHECK-NEXT: adds.w r2, r2, r12 ; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: ubfx r5, r6, #12, #1 -; CHECK-NEXT: ubfx r6, r6, #8, #1 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r6, r5 -; CHECK-NEXT: vmov.s8 r6, q1[15] -; CHECK-NEXT: vmov.s8 r5, q0[15] -; CHECK-NEXT: vmov.s8 r4, q1[14] -; CHECK-NEXT: vmov.s8 r2, q0[14] -; CHECK-NEXT: smull r6, r5, r5, r6 -; CHECK-NEXT: smull r2, r4, r2, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov.s8 r2, q1[14] +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: vmov.s8 r3, q3[14] +; CHECK-NEXT: vmov.s8 r5, q1[15] +; CHECK-NEXT: vmov.s8 r4, q3[15] +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, r4, d1 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r6 -; CHECK-NEXT: vmov r6, r5, d1 -; CHECK-NEXT: 
adds r2, r2, r6 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> @@ -2888,16 +2923,23 @@ ; CHECK-NEXT: vand q1, q2, q3 ; CHECK-NEXT: umull r2, r3, r2, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -2922,28 +2964,35 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q3, #0xff +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smull lr, r12, r3, r2 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: smull r2, r3, r2, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adds.w r3, r3, lr @@ -2967,27 +3016,34 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: vmov r2, r12, d3 +; CHECK-NEXT: vmov r3, lr, d1 ; CHECK-NEXT: vmov r6, r9, d2 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov r5, r11, d0 -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, r2, d4 -; CHECK-NEXT: csetm r12, eq -; CHECK-NEXT: umull r4, r7, r5, r6 -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: csetm r2, eq -; CHECK-NEXT: vmov q2[2], q2[0], r2, r12 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r12 -; CHECK-NEXT: vmov r2, r12, d3 ; CHECK-NEXT: umull r10, r8, r3, r2 +; CHECK-NEXT: umull r4, r7, r5, r6 ; CHECK-NEXT: mla r3, r3, r12, r8 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r10 ; CHECK-NEXT: mla r2, lr, r2, r3 ; CHECK-NEXT: mla 
r3, r5, r9, r7 ; CHECK-NEXT: mla r3, r11, r6, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, r7, d5 +; CHECK-NEXT: orrs r2, r7 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: vmov r7, r6, d0 ; CHECK-NEXT: adds r2, r2, r7 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll @@ -509,35 +509,15 @@ } define arm_aapcs_vfpcc <2 x i64> @v2i1and_vmov(<2 x i64> %a, <2 x i64> %b, i32 %c) { -; CHECKBE-LABEL: v2i1and_vmov: -; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: .vsave {d8, d9} -; CHECKBE-NEXT: vpush {d8, d9} -; CHECKBE-NEXT: cmp r0, #0 -; CHECKBE-NEXT: adr r1, .LCPI37_0 -; CHECKBE-NEXT: cset r0, eq -; CHECKBE-NEXT: vldrw.u32 q3, [r1] -; CHECKBE-NEXT: vmov.32 q4[3], r0 -; CHECKBE-NEXT: rsbs r0, r0, #0 -; CHECKBE-NEXT: vand q3, q4, q3 -; CHECKBE-NEXT: vmov.i8 q2, #0xff -; CHECKBE-NEXT: vmov r1, s15 -; CHECKBE-NEXT: vmov q3[2], q3[0], r0, r1 -; CHECKBE-NEXT: vmov q3[3], q3[1], r0, r1 -; CHECKBE-NEXT: vrev64.32 q4, q3 -; CHECKBE-NEXT: veor q2, q4, q2 -; CHECKBE-NEXT: vand q0, q0, q4 -; CHECKBE-NEXT: vand q1, q1, q2 -; CHECKBE-NEXT: vorr q0, q0, q1 -; CHECKBE-NEXT: vpop {d8, d9} -; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 4 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI37_0: -; CHECKBE-NEXT: .zero 4 -; CHECKBE-NEXT: .long 1 @ 0x1 -; CHECKBE-NEXT: .zero 4 -; CHECKBE-NEXT: .long 0 @ 0x0 +; CHECK-LABEL: v2i1and_vmov: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr entry: %c1 = icmp eq i32 %c, zeroinitializer %broadcast.splatinsert1967 = insertelement <2 x i1> undef, i1 %c1, i32 0 @@ -548,45 +528,15 @@ } define arm_aapcs_vfpcc <2 x i64> @v2i1or_vmov(<2 x i64> %a, <2 x i64> %b, i32 %c) { -; CHECKLE-LABEL: v2i1or_vmov: -; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: cmp r0, #0 -; CHECKLE-NEXT: vldr s8, .LCPI38_0 -; CHECKLE-NEXT: csetm r0, eq -; CHECKLE-NEXT: vmov s10, r0 -; CHECKLE-NEXT: vmov.f32 s9, s8 -; CHECKLE-NEXT: vmov.f32 s11, s10 -; CHECKLE-NEXT: vbic q1, q1, q2 -; CHECKLE-NEXT: vand q0, q0, q2 -; CHECKLE-NEXT: vorr q0, q0, q1 -; CHECKLE-NEXT: bx lr -; CHECKLE-NEXT: .p2align 2 -; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI38_0: -; CHECKLE-NEXT: .long 0xffffffff @ float NaN -; -; CHECKBE-LABEL: v2i1or_vmov: -; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: .vsave {d8, d9} -; CHECKBE-NEXT: vpush {d8, d9} -; CHECKBE-NEXT: cmp r0, #0 -; CHECKBE-NEXT: vldr s8, .LCPI38_0 -; CHECKBE-NEXT: csetm r0, eq -; CHECKBE-NEXT: vmov.i8 q3, #0xff -; CHECKBE-NEXT: vmov s10, r0 -; CHECKBE-NEXT: vmov.f32 s9, s8 -; CHECKBE-NEXT: vmov.f32 s11, s10 -; CHECKBE-NEXT: vrev64.32 q4, q2 -; CHECKBE-NEXT: veor q2, q4, q3 -; CHECKBE-NEXT: vand q0, q0, q4 -; CHECKBE-NEXT: vand q1, q1, q2 -; CHECKBE-NEXT: vorr q0, q0, q1 -; CHECKBE-NEXT: vpop {d8, d9} -; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 2 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI38_0: -; CHECKBE-NEXT: .long 0xffffffff @ float NaN +; 
CHECK-LABEL: v2i1or_vmov: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r1, #255 +; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr entry: %c1 = icmp eq i32 %c, zeroinitializer %broadcast.splatinsert1967 = insertelement <2 x i1> undef, i1 %c1, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpsel.ll b/llvm/test/CodeGen/Thumb2/mve-vpsel.ll --- a/llvm/test/CodeGen/Thumb2/mve-vpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vpsel.ll @@ -40,17 +40,17 @@ define arm_aapcs_vfpcc <2 x i64> @vpsel_i64(<2 x i64> %mask, <2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: vpsel_i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r12, r3, d1 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs.w r1, r2, r3 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: bfi r2, r0, #0, #8 +; CHECK-NEXT: orrs.w r0, r12, r3 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = icmp ne <2 x i64> %mask, zeroinitializer @@ -85,17 +85,17 @@ define arm_aapcs_vfpcc <2 x double> @vpsel_f64(<2 x i64> %mask, <2 x double> %src1, <2 x double> %src2) { ; CHECK-LABEL: vpsel_f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r12, r3, d1 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs.w r1, r2, r3 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: bfi r2, r0, #0, #8 +; CHECK-NEXT: orrs.w r0, r12, r3 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = icmp ne <2 x i64> %mask, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll @@ -164,46 +164,46 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_smaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: mvn r2, #-2147483648 -; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: mvn r12, #-2147483648 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: subs.w r0, r0, r12 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r1, r1, r2 -; CHECK-NEXT: mov.w r2, #-1 -; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: subs.w r1, r1, r12 +; CHECK-NEXT: sbcs r1, r2, #0 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: adr r0, .LCPI12_0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q1 -; 
CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 -; CHECK-NEXT: sbcs.w r0, r2, r1 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: adr r1, .LCPI12_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: mov.w r3, #-1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r3, r2 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r2, r3 +; CHECK-NEXT: sbcs.w r1, r3, r2 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: adr r0, .LCPI12_1 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: @@ -228,46 +228,46 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_sminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: mov.w r12, #-1 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 -; CHECK-NEXT: sbcs.w r0, r2, r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: sbcs.w r0, r12, r1 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r2, r3 -; CHECK-NEXT: mvn r2, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r12, r2 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: adr r0, .LCPI13_0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: subs r0, r0, r2 -; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r1, r1, r2 -; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: adr r1, .LCPI13_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: mvn r3, #-2147483648 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: sbcs r1, r2, #0 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: adr r0, .LCPI13_1 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q2, 
q2, q1 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: @@ -292,24 +292,24 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_umaxmin(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_umaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.i64 q1, #0xffffffff ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r1, #-1 -; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r2, #0 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp ult <2 x i64> %s0, @@ -320,24 +320,24 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_uminmax(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_uminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.i64 q1, #0xffffffff ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r1, #-1 -; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r2, #0 +; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c2 = icmp ult <2 x i64> %s0, diff --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll @@ -180,51 +180,53 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_smaxmin(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_smaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, r1, d1 ; CHECK-NEXT: mvn r12, #-2147483648 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: asrl r0, r1, #3 -; CHECK-NEXT: asrl r2, r3, #3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, r3, d0 +; CHECK-NEXT: asrl r2, r1, #3 +; CHECK-NEXT: asrl r0, r3, #3 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: subs.w r0, r0, r12 -; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: sbcs r0, r3, #0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 ; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r2, 
r12 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm lr, ne +; CHECK-NEXT: subs.w r2, r2, r12 +; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: bfi r3, lr, #0, #8 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: adr r0, .LCPI12_0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q1, q2, q1 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 -; CHECK-NEXT: sbcs.w r0, r2, r1 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: adr r1, .LCPI12_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: mov.w r3, #-1 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r2, r3 +; CHECK-NEXT: sbcs.w r1, r3, r2 ; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r3, r2 +; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: adr r0, .LCPI12_1 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI12_0: @@ -249,53 +251,53 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_sminmax(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_sminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r2, r1, d0 ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: asrl r0, r1, #3 -; CHECK-NEXT: rsbs.w r3, r0, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r12, r1 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov r4, r3, d0 +; CHECK-NEXT: asrl r2, r1, #3 +; CHECK-NEXT: mov.w lr, #0 +; CHECK-NEXT: rsbs.w r0, r2, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r12, r1 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: bfi r0, r3, #0, #8 +; CHECK-NEXT: vmov r4, r3, d1 ; CHECK-NEXT: asrl r4, r3, #3 -; CHECK-NEXT: csetm lr, ne -; CHECK-NEXT: rsbs.w r2, r4, #-2147483648 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r0 -; CHECK-NEXT: sbcs.w r2, r12, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r1 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: vmov q0[3], q0[1], r2, lr -; CHECK-NEXT: adr r2, .LCPI13_0 -; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 +; CHECK-NEXT: sbcs.w r5, r12, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: cset r5, lt ; CHECK-NEXT: mvn r2, #-2147483648 -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: cmp r5, #0 +; 
CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: bfi r0, r5, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: adr r0, .LCPI13_0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: bfi lr, r0, #0, #8 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: subs r0, r0, r2 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r1, r1, r2 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: bfi lr, r0, #8, #8 ; CHECK-NEXT: adr r0, .LCPI13_1 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, lr +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI13_0: @@ -321,27 +323,27 @@ ; CHECK-LABEL: vqshrni64_umaxmin: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vmov.i64 q1, #0xffffffff ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: lsrl r0, r1, #3 ; CHECK-NEXT: lsrl r2, r3, #3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: subs.w r2, r2, #-1 +; CHECK-NEXT: sbcs r2, r3, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r2, #-1 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q1, q2, q1 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0 = lshr <2 x i64> %so, @@ -354,27 +356,27 @@ ; CHECK-LABEL: vqshrni64_uminmax: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vmov.i64 q1, #0xffffffff ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: lsrl r0, r1, #3 ; CHECK-NEXT: lsrl r2, r3, #3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: subs.w r2, r2, #-1 +; CHECK-NEXT: sbcs r2, r3, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: cset r2, lo +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: cset r0, lo ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r1, r2, #-1 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vbic q1, q2, q1 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bfi 
r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0 = lshr <2 x i64> %so, diff --git a/llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll b/llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll --- a/llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll @@ -170,13 +170,16 @@ define arm_aapcs_vfpcc <2 x i64> @signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: signbit_mask_v2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr %cond = icmp slt <2 x i64> %a, zeroinitializer %r = select <2 x i1> %cond, <2 x i64> %b, <2 x i64> zeroinitializer @@ -219,13 +222,16 @@ define arm_aapcs_vfpcc <2 x i64> @signbit_setmask_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: signbit_setmask_v2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q2, q1 ; CHECK-NEXT: bx lr %cond = icmp slt <2 x i64> %a, zeroinitializer %r = select <2 x i1> %cond, <2 x i64> , <2 x i64> %b @@ -273,13 +279,22 @@ define arm_aapcs_vfpcc <2 x i64> @not_signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: not_signbit_mask_v2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: mvn.w r0, r0, asr #31 -; CHECK-NEXT: mvn.w r1, r1, asr #31 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: cmp.w r1, #-1 +; CHECK-NEXT: cset r1, gt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: cmp.w r1, #-1 +; CHECK-NEXT: cset r1, gt +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: bx lr %cond = icmp sgt <2 x i64> %a, %r = select <2 x i1> %cond, <2 x i64> %b, <2 x i64> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll @@ -128,14 +128,14 @@ ; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction: %mul = mul nsw i32 %conv3, %conv1 ; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction: %shr = ashr i32 %mul, 7 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %2 = icmp slt i32 %shr, 127 -; CHECK: LV: Found an estimated cost of 40 for VF 2 For instruction: %spec.select.i = select i1 %2, i32 %shr, i32 127 +; CHECK: LV: Found an estimated 
cost of 22 for VF 2 For instruction: %spec.select.i = select i1 %2, i32 %shr, i32 127 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %conv4 = trunc i32 %spec.select.i to i8 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %incdec.ptr5 = getelementptr inbounds i8, i8* %pDst.addr.010, i32 1 ; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction: store i8 %conv4, i8* %pDst.addr.010, align 1 ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %dec = add i32 %blkCnt.012, -1 ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cmp.not = icmp eq i32 %dec, 0 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %cmp.not, label %while.end.loopexit, label %while.body -; CHECK: LV: Vector loop of width 2 costs: 74. +; CHECK: LV: Vector loop of width 2 costs: 65. ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ] ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll @@ -8,7 +8,7 @@ ; CHECK-COST-LABEL: test ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction: %or.cond = select i1 %cmp2, i1 true, i1 %cmp3 -; CHECK-COST: LV: Found an estimated cost of 2 for VF 2 For instruction: %or.cond = select i1 %cmp2, i1 true, i1 %cmp3 +; CHECK-COST: LV: Found an estimated cost of 26 for VF 2 For instruction: %or.cond = select i1 %cmp2, i1 true, i1 %cmp3 ; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction: %or.cond = select i1 %cmp2, i1 true, i1 %cmp3 define float @test(float* nocapture readonly %pA, float* nocapture readonly %pB, i32 %blockSize) #0 {