diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15244,6 +15244,390 @@
                               DAG.getUNDEF(VT), NewMask);
 }
 
+/// Load/store instruction that can be merged with a base address
+/// update
+struct BaseUpdateTarget {
+  SDNode *N;
+  bool isIntrinsic;
+  bool isStore;
+  unsigned AddrOpIdx;
+};
+
+struct BaseUpdateUser {
+  /// Instruction that updates a pointer
+  SDNode *N;
+  /// Pointer increment operand
+  SDValue Inc;
+  /// Pointer increment value if it is a constant, or 0 otherwise
+  unsigned ConstInc;
+};
+
+static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
+                                 struct BaseUpdateUser &User,
+                                 bool SimpleConstIncOnly,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDNode *N = Target.N;
+  MemSDNode *MemN = cast<MemSDNode>(N);
+  SDLoc dl(N);
+
+  // Find the new opcode for the updating load/store.
+  bool isLoadOp = true;
+  bool isLaneOp = false;
+  // Workaround for vst1x and vld1x intrinsics which do not have alignment
+  // as an operand.
+  bool hasAlignment = true;
+  unsigned NewOpc = 0;
+  unsigned NumVecs = 0;
+  if (Target.isIntrinsic) {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::arm_neon_vld1:
+      NewOpc = ARMISD::VLD1_UPD;
+      NumVecs = 1;
+      break;
+    case Intrinsic::arm_neon_vld2:
+      NewOpc = ARMISD::VLD2_UPD;
+      NumVecs = 2;
+      break;
+    case Intrinsic::arm_neon_vld3:
+      NewOpc = ARMISD::VLD3_UPD;
+      NumVecs = 3;
+      break;
+    case Intrinsic::arm_neon_vld4:
+      NewOpc = ARMISD::VLD4_UPD;
+      NumVecs = 4;
+      break;
+    case Intrinsic::arm_neon_vld1x2:
+      NewOpc = ARMISD::VLD1x2_UPD;
+      NumVecs = 2;
+      hasAlignment = false;
+      break;
+    case Intrinsic::arm_neon_vld1x3:
+      NewOpc = ARMISD::VLD1x3_UPD;
+      NumVecs = 3;
+      hasAlignment = false;
+      break;
+    case Intrinsic::arm_neon_vld1x4:
+      NewOpc = ARMISD::VLD1x4_UPD;
+      NumVecs = 4;
+      hasAlignment = false;
+      break;
+    case Intrinsic::arm_neon_vld2dup:
+      NewOpc = ARMISD::VLD2DUP_UPD;
+      NumVecs = 2;
+      break;
+    case Intrinsic::arm_neon_vld3dup:
+      NewOpc = ARMISD::VLD3DUP_UPD;
+      NumVecs = 3;
+      break;
+    case Intrinsic::arm_neon_vld4dup:
+      NewOpc = ARMISD::VLD4DUP_UPD;
+      NumVecs = 4;
+      break;
+    case Intrinsic::arm_neon_vld2lane:
+      NewOpc = ARMISD::VLD2LN_UPD;
+      NumVecs = 2;
+      isLaneOp = true;
+      break;
+    case Intrinsic::arm_neon_vld3lane:
+      NewOpc = ARMISD::VLD3LN_UPD;
+      NumVecs = 3;
+      isLaneOp = true;
+      break;
+    case Intrinsic::arm_neon_vld4lane:
+      NewOpc = ARMISD::VLD4LN_UPD;
+      NumVecs = 4;
+      isLaneOp = true;
+      break;
+    case Intrinsic::arm_neon_vst1:
+      NewOpc = ARMISD::VST1_UPD;
+      NumVecs = 1;
+      isLoadOp = false;
+      break;
+    case Intrinsic::arm_neon_vst2:
+      NewOpc = ARMISD::VST2_UPD;
+      NumVecs = 2;
+      isLoadOp = false;
+      break;
+    case Intrinsic::arm_neon_vst3:
+      NewOpc = ARMISD::VST3_UPD;
+      NumVecs = 3;
+      isLoadOp = false;
+      break;
+    case Intrinsic::arm_neon_vst4:
+      NewOpc = ARMISD::VST4_UPD;
+      NumVecs = 4;
+      isLoadOp = false;
+      break;
+    case Intrinsic::arm_neon_vst2lane:
+      NewOpc = ARMISD::VST2LN_UPD;
+      NumVecs = 2;
+      isLoadOp = false;
+      isLaneOp = true;
+      break;
+    case Intrinsic::arm_neon_vst3lane:
+      NewOpc = ARMISD::VST3LN_UPD;
+      NumVecs = 3;
+      isLoadOp = false;
+      isLaneOp = true;
+      break;
+    case Intrinsic::arm_neon_vst4lane:
+      NewOpc = ARMISD::VST4LN_UPD;
+      NumVecs = 4;
+      isLoadOp = false;
+      isLaneOp = true;
+      break;
+    case Intrinsic::arm_neon_vst1x2:
+      NewOpc = ARMISD::VST1x2_UPD;
+      NumVecs = 2;
+      isLoadOp = false;
+      hasAlignment = false;
+      break;
+    case Intrinsic::arm_neon_vst1x3:
+      NewOpc = ARMISD::VST1x3_UPD;
+      NumVecs = 3;
+      isLoadOp = false;
+      hasAlignment = false;
+      break;
+    case Intrinsic::arm_neon_vst1x4:
+      NewOpc = ARMISD::VST1x4_UPD;
+      NumVecs = 4;
+      isLoadOp = false;
+      hasAlignment = false;
+      break;
+    }
+  } else {
+    isLaneOp = true;
+    switch (N->getOpcode()) {
+    default:
+      llvm_unreachable("unexpected opcode for Neon base update");
+    case ARMISD::VLD1DUP:
+      NewOpc = ARMISD::VLD1DUP_UPD;
+      NumVecs = 1;
+      break;
+    case ARMISD::VLD2DUP:
+      NewOpc = ARMISD::VLD2DUP_UPD;
+      NumVecs = 2;
+      break;
+    case ARMISD::VLD3DUP:
+      NewOpc = ARMISD::VLD3DUP_UPD;
+      NumVecs = 3;
+      break;
+    case ARMISD::VLD4DUP:
+      NewOpc = ARMISD::VLD4DUP_UPD;
+      NumVecs = 4;
+      break;
+    case ISD::LOAD:
+      NewOpc = ARMISD::VLD1_UPD;
+      NumVecs = 1;
+      isLaneOp = false;
+      break;
+    case ISD::STORE:
+      NewOpc = ARMISD::VST1_UPD;
+      NumVecs = 1;
+      isLaneOp = false;
+      isLoadOp = false;
+      break;
+    }
+  }
+
+  // Find the size of memory referenced by the load/store.
+  EVT VecTy;
+  if (isLoadOp) {
+    VecTy = N->getValueType(0);
+  } else if (Target.isIntrinsic) {
+    VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
+  } else {
+    assert(Target.isStore &&
+           "Node has to be a load, a store, or an intrinsic!");
+    VecTy = N->getOperand(1).getValueType();
+  }
+
+  bool isVLDDUPOp =
+      NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
+      NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
+
+  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+  if (isLaneOp || isVLDDUPOp)
+    NumBytes /= VecTy.getVectorNumElements();
+
+  if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
+    // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+    // separate instructions that make it harder to use a non-constant update.
+    return false;
+  }
+
+  if (SimpleConstIncOnly && User.ConstInc != NumBytes)
+    return false;
+
+  // OK, we found an ADD we can fold into the base update.
+  // Now, create a _UPD node, taking care of not breaking alignment.
+
+  EVT AlignedVecTy = VecTy;
+  unsigned Alignment = MemN->getAlignment();
+
+  // If this is a less-than-standard-aligned load/store, change the type to
+  // match the standard alignment.
+  // The alignment is overlooked when selecting _UPD variants; and it's
+  // easier to introduce bitcasts here than fix that.
+  // There are 3 ways to get to this base-update combine:
+  // - intrinsics: they are assumed to be properly aligned (to the standard
+  //   alignment of the memory type), so we don't need to do anything.
+  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+  //   intrinsics, so, likewise, there's nothing to do.
+  // - generic load/store instructions: the alignment is specified as an
+  //   explicit operand, rather than implicitly as the standard alignment
+  //   of the memory type (like the intrinsics). We need to change the
+  //   memory type to match the explicit alignment. That way, we don't
+  //   generate non-standard-aligned ARMISD::VLDx nodes.
+  if (isa<LSBaseSDNode>(N)) {
+    if (Alignment == 0)
+      Alignment = 1;
+    if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+      MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+      assert(!isLaneOp && "Unexpected generic load/store lane.");
+      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+    }
+    // Don't set an explicit alignment on regular load/stores that we want
+    // to transform to VLD/VST 1_UPD nodes.
+    // This matches the behavior of regular load/stores, which only get an
+    // explicit alignment if the MMO alignment is larger than the standard
+    // alignment of the memory type.
+    // Intrinsics, however, always get an explicit alignment, set to the
+    // alignment of the MMO.
+    Alignment = 1;
+  }
+
+  // Create the new updating load/store node.
+  // First, create an SDVTList for the new updating node's results.
+  EVT Tys[6];
+  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+  unsigned n;
+  for (n = 0; n < NumResultVecs; ++n)
+    Tys[n] = AlignedVecTy;
+  Tys[n++] = MVT::i32;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+  // Then, gather the new node's operands.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(N->getOperand(0)); // incoming chain
+  Ops.push_back(N->getOperand(Target.AddrOpIdx));
+  Ops.push_back(User.Inc);
+
+  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+    // Try to match the intrinsic's signature
+    Ops.push_back(StN->getValue());
+  } else {
+    // Loads (and of course intrinsics) match the intrinsics' signature,
+    // so just add all but the alignment operand.
+    unsigned LastOperand =
+        hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
+    for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
+      Ops.push_back(N->getOperand(i));
+  }
+
+  // For all node types, the alignment operand is always the last one.
+  Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+
+  // If this is a non-standard-aligned STORE, the penultimate operand is the
+  // stored value. Bitcast it to the aligned type.
+  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+    SDValue &StVal = Ops[Ops.size() - 2];
+    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+  }
+
+  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
+                                         MemN->getMemOperand());
+
+  // Update the uses.
+  SmallVector<SDValue, 5> NewResults;
+  for (unsigned i = 0; i < NumResultVecs; ++i)
+    NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+  // If this is a non-standard-aligned LOAD, the first result is the loaded
+  // value. Bitcast it to the expected result type.
+  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+    SDValue &LdVal = NewResults[0];
+    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+  }
+
+  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+  DCI.CombineTo(N, NewResults);
+  DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
+
+  return true;
+}
+
+// If (opcode ptr inc) is an ADD-like instruction, return the
+// increment value. Otherwise return 0.
+static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
+                                         SDValue Inc,
+                                         const SelectionDAG &DAG) {
+  ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+  if (!CInc)
+    return 0;
+
+  switch (Opcode) {
+  case ARMISD::VLD1_UPD:
+  case ISD::ADD:
+    return CInc->getZExtValue();
+  case ISD::OR: {
+    if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
+      // (OR ptr inc) is the same as (ADD ptr inc)
+      return CInc->getZExtValue();
+    }
+    return 0;
+  }
+  default:
+    return 0;
+  }
+}
+
+static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
+  switch (N->getOpcode()) {
+  case ISD::ADD:
+  case ISD::OR: {
+    if (isa<ConstantSDNode>(N->getOperand(1))) {
+      *Ptr = N->getOperand(0);
+      *CInc = N->getOperand(1);
+      return true;
+    }
+    return false;
+  }
+  case ARMISD::VLD1_UPD: {
+    if (isa<ConstantSDNode>(N->getOperand(2))) {
+      *Ptr = N->getOperand(1);
+      *CInc = N->getOperand(2);
+      return true;
+    }
+    return false;
+  }
+  default:
+    return false;
+  }
+}
+
+static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
+  // Check that the add is independent of the load/store.
+  // Otherwise, folding it would create a cycle. Search through Addr
+  // as well, since the User may not be a direct user of Addr and
+  // only share a base pointer.
+  SmallPtrSet<const SDNode *, 32> Visited;
+  SmallVector<const SDNode *, 16> Worklist;
+  Worklist.push_back(N);
+  Worklist.push_back(User);
+  if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
+      SDNode::hasPredecessorHelper(User, Visited, Worklist))
+    return false;
+  return true;
+}
+
 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
 /// NEON load/store intrinsics, and generic vector load/stores, to merge
 /// base address updates.
@@ -15251,237 +15635,89 @@
 /// The caller is assumed to have checked legality.
 static SDValue CombineBaseUpdate(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
   const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                             N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
   const bool isStore = N->getOpcode() == ISD::STORE;
   const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
+  BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
+
   SDValue Addr = N->getOperand(AddrOpIdx);
-  MemSDNode *MemN = cast<MemSDNode>(N);
-  SDLoc dl(N);
+
+  SmallVector<BaseUpdateUser, 8> BaseUpdates;
 
   // Search for a use of the address operand that is an increment.
   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
     SDNode *User = *UI;
-    if (User->getOpcode() != ISD::ADD ||
-        UI.getUse().getResNo() != Addr.getResNo())
+    if (UI.getUse().getResNo() != Addr.getResNo() ||
+        User->getNumOperands() != 2)
       continue;
 
-    // Check that the add is independent of the load/store. Otherwise, folding
-    // it would create a cycle. We can avoid searching through Addr as it's a
-    // predecessor to both.
-    SmallPtrSet<const SDNode *, 32> Visited;
-    SmallVector<const SDNode *, 16> Worklist;
-    Visited.insert(Addr.getNode());
-    Worklist.push_back(N);
-    Worklist.push_back(User);
-    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
-        SDNode::hasPredecessorHelper(User, Visited, Worklist))
-      continue;
-
-    // Find the new opcode for the updating load/store.
-    bool isLoadOp = true;
-    bool isLaneOp = false;
-    // Workaround for vst1x and vld1x intrinsics which do not have alignment
-    // as an operand.
- bool hasAlignment = true; - unsigned NewOpc = 0; - unsigned NumVecs = 0; - if (isIntrinsic) { - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; - NumVecs = 1; break; - case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; - NumVecs = 2; break; - case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; - NumVecs = 3; break; - case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; - NumVecs = 4; break; - case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD; - NumVecs = 2; hasAlignment = false; break; - case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD; - NumVecs = 3; hasAlignment = false; break; - case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD; - NumVecs = 4; hasAlignment = false; break; - case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD; - NumVecs = 2; break; - case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD; - NumVecs = 3; break; - case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD; - NumVecs = 4; break; - case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; - NumVecs = 2; isLaneOp = true; break; - case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; - NumVecs = 3; isLaneOp = true; break; - case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; - NumVecs = 4; isLaneOp = true; break; - case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; - NumVecs = 1; isLoadOp = false; break; - case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; - NumVecs = 2; isLoadOp = false; break; - case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; - NumVecs = 3; isLoadOp = false; break; - case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; - NumVecs = 4; isLoadOp = false; break; - case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; - NumVecs = 2; isLoadOp = false; isLaneOp = true; break; - case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; - NumVecs = 3; isLoadOp = false; isLaneOp = true; break; - case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; - NumVecs = 4; isLoadOp = false; isLaneOp = true; break; - case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD; - NumVecs = 2; isLoadOp = false; hasAlignment = false; break; - case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD; - NumVecs = 3; isLoadOp = false; hasAlignment = false; break; - case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD; - NumVecs = 4; isLoadOp = false; hasAlignment = false; break; - } - } else { - isLaneOp = true; - switch (N->getOpcode()) { - default: llvm_unreachable("unexpected opcode for Neon base update"); - case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; - case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; - case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; - case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; - case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; - NumVecs = 1; isLaneOp = false; break; - case ISD::STORE: NewOpc = ARMISD::VST1_UPD; - NumVecs = 1; isLaneOp = false; isLoadOp = false; break; - } - } - - // Find the size of memory referenced by the load/store. 
- EVT VecTy; - if (isLoadOp) { - VecTy = N->getValueType(0); - } else if (isIntrinsic) { - VecTy = N->getOperand(AddrOpIdx+1).getValueType(); - } else { - assert(isStore && "Node has to be a load, a store, or an intrinsic!"); - VecTy = N->getOperand(1).getValueType(); - } - - bool isVLDDUPOp = - NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD || - NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD; - - unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (isLaneOp || isVLDDUPOp) - NumBytes /= VecTy.getVectorNumElements(); - - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - ConstantSDNode *CInc = dyn_cast(Inc.getNode()); - if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { - // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two - // separate instructions that make it harder to use a non-constant update. - continue; - } + SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1); + unsigned ConstInc = + getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG); - // OK, we found an ADD we can fold into the base update. - // Now, create a _UPD node, taking care of not breaking alignment. - - EVT AlignedVecTy = VecTy; - unsigned Alignment = MemN->getAlignment(); - - // If this is a less-than-standard-aligned load/store, change the type to - // match the standard alignment. - // The alignment is overlooked when selecting _UPD variants; and it's - // easier to introduce bitcasts here than fix that. - // There are 3 ways to get to this base-update combine: - // - intrinsics: they are assumed to be properly aligned (to the standard - // alignment of the memory type), so we don't need to do anything. - // - ARMISD::VLDx nodes: they are only generated from the aforementioned - // intrinsics, so, likewise, there's nothing to do. - // - generic load/store instructions: the alignment is specified as an - // explicit operand, rather than implicitly as the standard alignment - // of the memory type (like the intrisics). We need to change the - // memory type to match the explicit alignment. That way, we don't - // generate non-standard-aligned ARMISD::VLDx nodes. - if (isa(N)) { - if (Alignment == 0) - Alignment = 1; - if (Alignment < VecTy.getScalarSizeInBits() / 8) { - MVT EltTy = MVT::getIntegerVT(Alignment * 8); - assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); - assert(!isLaneOp && "Unexpected generic load/store lane."); - unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); - AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); - } - // Don't set an explicit alignment on regular load/stores that we want - // to transform to VLD/VST 1_UPD nodes. - // This matches the behavior of regular load/stores, which only get an - // explicit alignment if the MMO alignment is larger than the standard - // alignment of the memory type. - // Intrinsics, however, always get an explicit alignment, set to the - // alignment of the MMO. - Alignment = 1; - } + if (ConstInc || User->getOpcode() == ISD::ADD) + BaseUpdates.push_back({User, Inc, ConstInc}); + } - // Create the new updating load/store node. - // First, create an SDVTList for the new updating node's results. - EVT Tys[6]; - unsigned NumResultVecs = (isLoadOp ? 
NumVecs : 0); - unsigned n; - for (n = 0; n < NumResultVecs; ++n) - Tys[n] = AlignedVecTy; - Tys[n++] = MVT::i32; - Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); + // If the address is a constant pointer increment itself, find + // another constant increment that has the same base operand + SDValue Base; + SDValue CInc; + if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) { + unsigned Offset = + getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG); + for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end(); + UI != UE; ++UI) { - // Then, gather the new node's operands. - SmallVector Ops; - Ops.push_back(N->getOperand(0)); // incoming chain - Ops.push_back(N->getOperand(AddrOpIdx)); - Ops.push_back(Inc); + SDNode *User = *UI; + if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() || + User->getNumOperands() != 2) + continue; - if (StoreSDNode *StN = dyn_cast(N)) { - // Try to match the intrinsic's signature - Ops.push_back(StN->getValue()); - } else { - // Loads (and of course intrinsics) match the intrinsics' signature, - // so just add all but the alignment operand. - unsigned LastOperand = - hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands(); - for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i) - Ops.push_back(N->getOperand(i)); - } + SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0); + unsigned UserOffset = + getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG); - // For all node types, the alignment operand is always the last one. - Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); + if (!UserOffset || UserOffset <= Offset) + continue; - // If this is a non-standard-aligned STORE, the penultimate operand is the - // stored value. Bitcast it to the aligned type. - if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { - SDValue &StVal = Ops[Ops.size()-2]; - StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); + unsigned NewConstInc = UserOffset - Offset; + SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32); + BaseUpdates.push_back({User, NewInc, NewConstInc}); } + } - EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, - MemN->getMemOperand()); - - // Update the uses. - SmallVector NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) - NewResults.push_back(SDValue(UpdN.getNode(), i)); - - // If this is an non-standard-aligned LOAD, the first result is the loaded - // value. Bitcast it to the expected result type. - if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { - SDValue &LdVal = NewResults[0]; - LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); + // Try to fold the load/store with an update that matches memory + // access size. This should work well for sequential loads. + // + // Filter out invalid updates as well. 
+ unsigned NumValidUpd = BaseUpdates.size(); + for (unsigned I = 0; I < NumValidUpd;) { + BaseUpdateUser &User = BaseUpdates[I]; + if (!isValidBaseUpdate(N, User.N)) { + --NumValidUpd; + std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]); + continue; } - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain - DCI.CombineTo(N, NewResults); - DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); - - break; + if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI)) + return SDValue(); + ++I; + } + BaseUpdates.resize(NumValidUpd); + + // Try to fold with other users. Non-constant updates are considered + // first, and constant updates are sorted to not break a sequence of + // strided accesses (if there is any). + std::sort(BaseUpdates.begin(), BaseUpdates.end(), + [](BaseUpdateUser &LHS, BaseUpdateUser &RHS) { + return LHS.ConstInc < RHS.ConstInc; + }); + for (BaseUpdateUser &User : BaseUpdates) { + if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI)) + return SDValue(); } return SDValue(); } diff --git a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll --- a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll +++ b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll @@ -3,80 +3,57 @@ ; rdar://12713765 ; When realign-stack is set to false, make sure we are not creating stack ; objects that are assumed to be 64-byte aligned. -@T3_retval = common global <16 x float> zeroinitializer, align 16 define void @test1(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp "no-realign-stack" { -entry: ; CHECK-LABEL: test1: -; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]] -; CHECK: mov r[[R2:[0-9]+]], r[[R1]] -; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]! -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R3:[0-9]+]], r[[R1]], #32 -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: add r[[R3:[0-9]+]], r[[R1]], #48 -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: mov r[[R2:[0-9]+]], sp -; CHECK: add r[[R3:[0-9]+]], r[[R2]], #48 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: add r[[R4:[0-9]+]], r[[R2]], #32 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128] -; CHECK: mov r[[R5:[0-9]+]], r[[R2]] -; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]! -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #48 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #32 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! 
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128] +; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}} +; CHECK: mov r[[NOTALIGNED:[0-9]+]], sp +; CHECK: add r[[NOTALIGNED]], r[[NOTALIGNED]], #32 +; CHECK: add r[[PTR]], r[[PTR]], #32 +; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128] +; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128] +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128] +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128] +entry: %retval = alloca <16 x float>, align 64 - %0 = load <16 x float>, <16 x float>* @T3_retval, align 16 - store <16 x float> %0, <16 x float>* %retval - %1 = load <16 x float>, <16 x float>* %retval - store <16 x float> %1, <16 x float>* %agg.result, align 16 + %a1 = bitcast <16 x float>* %retval to float* + %a2 = getelementptr inbounds float, float* %a1, i64 8 + %a3 = bitcast float* %a2 to <4 x float>* + + %b1 = bitcast <16 x float>* %agg.result to float* + %b2 = getelementptr inbounds float, float* %b1, i64 8 + %b3 = bitcast float* %b2 to <4 x float>* + + %0 = load <4 x float>, <4 x float>* %a3, align 16 + %1 = load <4 x float>, <4 x float>* %b3, align 16 + store <4 x float> %0, <4 x float>* %b3, align 16 + store <4 x float> %1, <4 x float>* %a3, align 16 ret void } define void @test2(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp { -entry: ; CHECK-LABEL: test2: -; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]] -; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48 -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32 -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]! -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: mov r[[R1:[0-9]+]], sp -; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #16 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: mov r[[R3:[0-9]+]], #32 -; CHECK: mov r[[R9:[0-9]+]], r[[R1]] -; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128], r[[R3]] -; CHECK: mov r[[R3:[0-9]+]], r[[R9]] -; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]! -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] -; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #48 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #32 -; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! 
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128] +; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}} +; CHECK: mov r[[ALIGNED:[0-9]+]], sp +; CHECK: orr r[[ALIGNED]], r[[ALIGNED]], #32 +; CHECK: add r[[PTR]], r[[PTR]], #32 +; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128] +; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128] +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128] +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128] +entry: + %retval = alloca <16 x float>, align 64 + %a1 = bitcast <16 x float>* %retval to float* + %a2 = getelementptr inbounds float, float* %a1, i64 8 + %a3 = bitcast float* %a2 to <4 x float>* + %b1 = bitcast <16 x float>* %agg.result to float* + %b2 = getelementptr inbounds float, float* %b1, i64 8 + %b3 = bitcast float* %b2 to <4 x float>* -%retval = alloca <16 x float>, align 64 - %0 = load <16 x float>, <16 x float>* @T3_retval, align 16 - store <16 x float> %0, <16 x float>* %retval - %1 = load <16 x float>, <16 x float>* %retval - store <16 x float> %1, <16 x float>* %agg.result, align 16 + %0 = load <4 x float>, <4 x float>* %a3, align 16 + %1 = load <4 x float>, <4 x float>* %b3, align 16 + store <4 x float> %0, <4 x float>* %b3, align 16 + store <4 x float> %1, <4 x float>* %a3, align 16 ret void } diff --git a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll @@ -0,0 +1,325 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -o - < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8-unknown-linux-gnueabihf" + +define <4 x float> @test(float* %A) { +; CHECK-LABEL: test: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! 
+; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0] +; CHECK-NEXT: vadd.f32 q0, q8, q9 +; CHECK-NEXT: bx lr + %X.ptr = bitcast float* %A to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +define <4 x float> @test_stride(float* %A) { +; CHECK-LABEL: test_stride: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #24 +; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1 +; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0] +; CHECK-NEXT: vadd.f32 q0, q8, q9 +; CHECK-NEXT: bx lr + %X.ptr = bitcast float* %A to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 12 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +define <4 x float> @test_stride_mixed(float* %A) { +; CHECK-LABEL: test_stride_mixed: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #24 +; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! +; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0] +; CHECK-NEXT: vadd.f32 q0, q8, q9 +; CHECK-NEXT: bx lr + %X.ptr = bitcast float* %A to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 10 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +; Refrain from using multiple stride registers +define <4 x float> @test_stride_noop(float* %A) { +; CHECK-LABEL: test_stride_noop: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #24 +; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1 +; CHECK-NEXT: mov r1, #32 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1 +; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0] +; CHECK-NEXT: vadd.f32 q0, q8, q9 +; CHECK-NEXT: bx lr + %X.ptr = bitcast float* %A to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +define <4 x float> @test_positive_initial_offset(float* %A) { +; CHECK-LABEL: test_positive_initial_offset: +; CHECK: @ %bb.0: +; CHECK-NEXT: 
add r0, r0, #32 +; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! +; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0] +; CHECK-NEXT: vadd.f32 q0, q8, q9 +; CHECK-NEXT: bx lr + %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8 + %X.ptr = bitcast float* %X.ptr.elt to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 12 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 16 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +define <4 x float> @test_negative_initial_offset(float* %A) { +; CHECK-LABEL: test_negative_initial_offset: +; CHECK: @ %bb.0: +; CHECK-NEXT: sub r0, r0, #64 +; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! +; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0] +; CHECK-NEXT: vadd.f32 q0, q8, q9 +; CHECK-NEXT: bx lr + %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16 + %X.ptr = bitcast float* %X.ptr.elt to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 -12 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 -8 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +@global_float_array = external global [128 x float], align 4 +define <4 x float> @test_global() { +; CHECK-LABEL: test_global: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, :lower16:global_float_array +; CHECK-NEXT: movt r0, :upper16:global_float_array +; CHECK-NEXT: add r0, r0, #32 +; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! +; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vld1.32 {d18, d19}, [r0] +; CHECK-NEXT: vadd.f32 q0, q8, q9 +; CHECK-NEXT: bx lr + %X = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*), align 4 + %Y = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 12) to <4 x float>*), align 4 + %Z = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 16) to <4 x float>*), align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +define <4 x float> @test_stack() { +; Use huge alignment to test that ADD would not be converted to OR +; CHECK-LABEL: test_stack: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r10, r11, lr} +; CHECK-NEXT: push {r4, r10, r11, lr} +; CHECK-NEXT: .setfp r11, sp, #8 +; CHECK-NEXT: add r11, sp, #8 +; CHECK-NEXT: .pad #240 +; CHECK-NEXT: sub sp, sp, #240 +; CHECK-NEXT: bfc sp, #0, #7 +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: bl external_function +; CHECK-NEXT: vld1.32 {d16, d17}, [r4:128]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r4:128]! 
+; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128] +; CHECK-NEXT: vadd.f32 q0, q8, q9 +; CHECK-NEXT: sub sp, r11, #8 +; CHECK-NEXT: pop {r4, r10, r11, pc} + %array = alloca [32 x float], align 128 + %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0 + call void @external_function(float* %arraydecay) + %X.ptr = bitcast [32 x float]* %array to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 4 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 8 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +define <2 x double> @test_double(double* %A) { +; CHECK-LABEL: test_double: +; CHECK: @ %bb.0: +; CHECK-NEXT: add r0, r0, #64 +; CHECK-NEXT: vld1.64 {d16, d17}, [r0]! +; CHECK-NEXT: vld1.64 {d18, d19}, [r0]! +; CHECK-NEXT: vadd.f64 d20, d17, d19 +; CHECK-NEXT: vadd.f64 d16, d16, d18 +; CHECK-NEXT: vld1.64 {d22, d23}, [r0] +; CHECK-NEXT: vadd.f64 d1, d20, d23 +; CHECK-NEXT: vadd.f64 d0, d16, d22 +; CHECK-NEXT: bx lr + %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8 + %X.ptr = bitcast double* %X.ptr.elt to <2 x double>* + %X = load <2 x double>, <2 x double>* %X.ptr, align 8 + %Y.ptr.elt = getelementptr inbounds double, double* %A, i32 10 + %Y.ptr = bitcast double* %Y.ptr.elt to <2 x double>* + %Y = load <2 x double>, <2 x double>* %Y.ptr, align 8 + %Z.ptr.elt = getelementptr inbounds double, double* %A, i32 12 + %Z.ptr = bitcast double* %Z.ptr.elt to <2 x double>* + %Z = load <2 x double>, <2 x double>* %Z.ptr, align 8 + %tmp.sum = fadd <2 x double> %X, %Y + %sum = fadd <2 x double> %tmp.sum, %Z + ret <2 x double> %sum +} + +define void @test_various_instructions(float* %A) { +; CHECK-LABEL: test_various_instructions: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! +; CHECK-NEXT: vadd.f32 q8, q8, q9 +; CHECK-NEXT: vst1.32 {d16, d17}, [r0] +; CHECK-NEXT: bx lr + %X.ptr = bitcast float* %A to i8* + %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1) + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8 + %Z.ptr = bitcast float* %Z.ptr.elt to i8* + %Z = fadd <4 x float> %X, %Y + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %Z.ptr, <4 x float> %Z, i32 4) + ret void +} + +define void @test_lsr_geps(float* %a, float* %b, i32 %n) { +; CHECK-LABEL: test_lsr_geps: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB10_1: @ %for.body.preheader +; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: .LBB10_2: @ %for.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add r3, r0, r12 +; CHECK-NEXT: subs r2, r2, #1 +; CHECK-NEXT: vld1.32 {d16, d17}, [r3]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r3]! +; CHECK-NEXT: vld1.32 {d20, d21}, [r3]! +; CHECK-NEXT: vld1.32 {d22, d23}, [r3] +; CHECK-NEXT: add r3, r1, r12 +; CHECK-NEXT: add r12, r12, #64 +; CHECK-NEXT: vst1.32 {d16, d17}, [r3]! +; CHECK-NEXT: vst1.32 {d18, d19}, [r3]! 
+; CHECK-NEXT: vst1.32 {d20, d21}, [r3]! +; CHECK-NEXT: vst1.32 {d22, d23}, [r3] +; CHECK-NEXT: bne .LBB10_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %cmp61 = icmp sgt i32 %n, 0 + br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %lsr.iv1 = phi i32 [ 0, %for.body.preheader ], [ %lsr.iv.next2, %for.body ] + %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %0 = bitcast float* %a to i8* + %1 = bitcast float* %b to i8* + %uglygep19 = getelementptr i8, i8* %0, i32 %lsr.iv1 + %uglygep1920 = bitcast i8* %uglygep19 to <4 x float>* + %2 = load <4 x float>, <4 x float>* %uglygep1920, align 4 + %uglygep16 = getelementptr i8, i8* %0, i32 %lsr.iv1 + %uglygep1617 = bitcast i8* %uglygep16 to <4 x float>* + %scevgep18 = getelementptr <4 x float>, <4 x float>* %uglygep1617, i32 1 + %3 = load <4 x float>, <4 x float>* %scevgep18, align 4 + %uglygep13 = getelementptr i8, i8* %0, i32 %lsr.iv1 + %uglygep1314 = bitcast i8* %uglygep13 to <4 x float>* + %scevgep15 = getelementptr <4 x float>, <4 x float>* %uglygep1314, i32 2 + %4 = load <4 x float>, <4 x float>* %scevgep15, align 4 + %uglygep10 = getelementptr i8, i8* %0, i32 %lsr.iv1 + %uglygep1011 = bitcast i8* %uglygep10 to <4 x float>* + %scevgep12 = getelementptr <4 x float>, <4 x float>* %uglygep1011, i32 3 + %5 = load <4 x float>, <4 x float>* %scevgep12, align 4 + %uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv1 + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %uglygep8, <4 x float> %2, i32 4) + %uglygep6 = getelementptr i8, i8* %1, i32 %lsr.iv1 + %scevgep7 = getelementptr i8, i8* %uglygep6, i32 16 + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep7, <4 x float> %3, i32 4) + %uglygep4 = getelementptr i8, i8* %1, i32 %lsr.iv1 + %scevgep5 = getelementptr i8, i8* %uglygep4, i32 32 + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep5, <4 x float> %4, i32 4) + %uglygep = getelementptr i8, i8* %1, i32 %lsr.iv1 + %scevgep = getelementptr i8, i8* %uglygep, i32 48 + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep, <4 x float> %5, i32 4) + %lsr.iv.next = add i32 %lsr.iv, -1 + %lsr.iv.next2 = add nuw i32 %lsr.iv1, 64 + %exitcond.not = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare void @external_function(float*) +declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly +declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind argmemonly diff --git a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll --- a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll +++ b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll @@ -83,16 +83,16 @@ ; SOFT: @ %bb.0: @ %entry ; SOFT-NEXT: push {r11, lr} ; SOFT-NEXT: sub sp, sp, #32 -; SOFT-NEXT: vldr d16, [sp, #40] -; SOFT-NEXT: mov r12, #16 -; SOFT-NEXT: vabs.f16 d16, d16 -; SOFT-NEXT: mov lr, sp -; SOFT-NEXT: vst1.16 {d16}, [lr:64], r12 ; SOFT-NEXT: add r12, sp, #48 ; SOFT-NEXT: vld1.64 {d16, d17}, [r12] +; SOFT-NEXT: add r12, sp, #16 ; SOFT-NEXT: vabs.f16 q8, q8 -; SOFT-NEXT: str r3, [sp, #8] -; SOFT-NEXT: vst1.64 {d16, d17}, [lr] +; SOFT-NEXT: vst1.64 {d16, d17}, [r12] +; SOFT-NEXT: mov r12, sp +; SOFT-NEXT: vldr d16, [sp, #40] +; SOFT-NEXT: vabs.f16 d16, d16 +; SOFT-NEXT: vst1.16 {d16}, [r12:64]! 
+; SOFT-NEXT: str r3, [r12] ; SOFT-NEXT: bl use ; SOFT-NEXT: add sp, sp, #32 ; SOFT-NEXT: pop {r11, pc} @@ -105,26 +105,26 @@ ; ; SOFTEB-LABEL: test: ; SOFTEB: @ %bb.0: @ %entry -; SOFTEB-NEXT: .save {r11, lr} -; SOFTEB-NEXT: push {r11, lr} +; SOFTEB-NEXT: .save {r4, lr} +; SOFTEB-NEXT: push {r4, lr} ; SOFTEB-NEXT: .pad #32 ; SOFTEB-NEXT: sub sp, sp, #32 ; SOFTEB-NEXT: vldr d16, [sp, #40] -; SOFTEB-NEXT: mov r12, #16 ; SOFTEB-NEXT: mov lr, sp -; SOFTEB-NEXT: str r3, [sp, #8] +; SOFTEB-NEXT: add r4, sp, #48 +; SOFTEB-NEXT: add r12, sp, #16 ; SOFTEB-NEXT: vrev64.16 d16, d16 ; SOFTEB-NEXT: vabs.f16 d16, d16 -; SOFTEB-NEXT: vst1.16 {d16}, [lr:64], r12 -; SOFTEB-NEXT: add r12, sp, #48 -; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12] +; SOFTEB-NEXT: vst1.16 {d16}, [lr:64]! +; SOFTEB-NEXT: vld1.64 {d16, d17}, [r4] ; SOFTEB-NEXT: vrev64.16 q8, q8 +; SOFTEB-NEXT: str r3, [lr] ; SOFTEB-NEXT: vabs.f16 q8, q8 ; SOFTEB-NEXT: vrev64.16 q8, q8 -; SOFTEB-NEXT: vst1.64 {d16, d17}, [lr] +; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12] ; SOFTEB-NEXT: bl use ; SOFTEB-NEXT: add sp, sp, #32 -; SOFTEB-NEXT: pop {r11, pc} +; SOFTEB-NEXT: pop {r4, pc} ; ; HARDEB-LABEL: test: ; HARDEB: @ %bb.0: @ %entry @@ -148,20 +148,20 @@ ; SOFT-NEXT: push {r11, lr} ; SOFT-NEXT: sub sp, sp, #32 ; SOFT-NEXT: add r12, sp, #80 -; SOFT-NEXT: mov lr, sp ; SOFT-NEXT: vld1.64 {d16, d17}, [r12] ; SOFT-NEXT: add r12, sp, #48 ; SOFT-NEXT: vabs.f16 q8, q8 ; SOFT-NEXT: vld1.64 {d18, d19}, [r12] ; SOFT-NEXT: add r12, sp, #64 -; SOFT-NEXT: str r3, [sp, #8] ; SOFT-NEXT: vadd.f16 q8, q8, q9 ; SOFT-NEXT: vld1.64 {d18, d19}, [r12] -; SOFT-NEXT: mov r12, #16 +; SOFT-NEXT: add r12, sp, #16 ; SOFT-NEXT: vmul.f16 q8, q9, q8 -; SOFT-NEXT: vldr d18, [sp, #40] -; SOFT-NEXT: vst1.16 {d18}, [lr:64], r12 -; SOFT-NEXT: vst1.64 {d16, d17}, [lr] +; SOFT-NEXT: vst1.64 {d16, d17}, [r12] +; SOFT-NEXT: mov r12, sp +; SOFT-NEXT: vldr d16, [sp, #40] +; SOFT-NEXT: vst1.16 {d16}, [r12:64]! +; SOFT-NEXT: str r3, [r12] ; SOFT-NEXT: bl use ; SOFT-NEXT: add sp, sp, #32 ; SOFT-NEXT: pop {r11, pc} @@ -181,13 +181,8 @@ ; SOFTEB-NEXT: push {r11, lr} ; SOFTEB-NEXT: .pad #32 ; SOFTEB-NEXT: sub sp, sp, #32 -; SOFTEB-NEXT: vldr d16, [sp, #40] -; SOFTEB-NEXT: mov r12, #16 -; SOFTEB-NEXT: mov lr, sp -; SOFTEB-NEXT: str r3, [sp, #8] -; SOFTEB-NEXT: vrev64.16 d16, d16 -; SOFTEB-NEXT: vst1.16 {d16}, [lr:64], r12 ; SOFTEB-NEXT: add r12, sp, #80 +; SOFTEB-NEXT: mov lr, sp ; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12] ; SOFTEB-NEXT: add r12, sp, #48 ; SOFTEB-NEXT: vrev64.16 q8, q8 @@ -197,10 +192,15 @@ ; SOFTEB-NEXT: vrev64.16 q9, q9 ; SOFTEB-NEXT: vadd.f16 q8, q8, q9 ; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12] +; SOFTEB-NEXT: add r12, sp, #16 ; SOFTEB-NEXT: vrev64.16 q9, q9 ; SOFTEB-NEXT: vmul.f16 q8, q9, q8 +; SOFTEB-NEXT: vldr d18, [sp, #40] +; SOFTEB-NEXT: vrev64.16 d18, d18 +; SOFTEB-NEXT: vst1.16 {d18}, [lr:64]! 
+; SOFTEB-NEXT: str r3, [lr] ; SOFTEB-NEXT: vrev64.16 q8, q8 -; SOFTEB-NEXT: vst1.64 {d16, d17}, [lr] +; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12] ; SOFTEB-NEXT: bl use ; SOFTEB-NEXT: add sp, sp, #32 ; SOFTEB-NEXT: pop {r11, pc} diff --git a/llvm/test/CodeGen/ARM/large-vector.ll b/llvm/test/CodeGen/ARM/large-vector.ll --- a/llvm/test/CodeGen/ARM/large-vector.ll +++ b/llvm/test/CodeGen/ARM/large-vector.ll @@ -26,20 +26,18 @@ define void @test_produce_arg() { ; CHECK-LABEL: test_produce_arg: -; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #32 -; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128] ; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #16 +; CHECK-V7K: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]! ; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128] -; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #24 -; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]] ; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #8 +; CHECK-AAPCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]! ; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]] -; CHECK-APCS: add r[[BASE:[0-9]+]], sp, #60 -; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]] +; CHECK-APCS: mov r[[R4:[0-9]+]], sp ; CHECK-APCS: mov r[[BASE:[0-9]+]], sp -; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #76 +; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #60 +; CHECK-APCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]! ; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]] call <32 x i8> @test_consume_arg([9 x double] undef, <32 x i8> zeroinitializer) diff --git a/llvm/test/CodeGen/ARM/memcpy-inline.ll b/llvm/test/CodeGen/ARM/memcpy-inline.ll --- a/llvm/test/CodeGen/ARM/memcpy-inline.ll +++ b/llvm/test/CodeGen/ARM/memcpy-inline.ll @@ -44,11 +44,10 @@ define void @t2(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t2: -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]! -; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: movs [[INC:r[0-9]+]], #32 -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]] +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! +; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! ; CHECK: movw [[REG2:r[0-9]+]], #16716 ; CHECK: movt [[REG2:r[0-9]+]], #72 ; CHECK: str [[REG2]], [r0] diff --git a/llvm/test/CodeGen/ARM/memset-align.ll b/llvm/test/CodeGen/ARM/memset-align.ll --- a/llvm/test/CodeGen/ARM/memset-align.ll +++ b/llvm/test/CodeGen/ARM/memset-align.ll @@ -10,18 +10,17 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: vmov.i32 q8, #0x0 ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: mov.w r1, #-1 -; CHECK-NEXT: vmov.i32 q8, #0x0 -; CHECK-NEXT: movs r2, #15 -; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: strd r1, r1, [sp, #8] ; CHECK-NEXT: strd r1, r1, [sp] -; CHECK-NEXT: str r1, [sp, #16] -; CHECK-NEXT: vst1.64 {d16, d17}, [r3], r2 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: str r2, [r3] +; CHECK-NEXT: vst1.64 {d16, d17}, [r2]! 
+; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: str r1, [sp, #20] +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: str.w r1, [sp, #15] ; CHECK-NEXT: bl callee ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll --- a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll +++ b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll @@ -76,13 +76,14 @@ ; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]] -; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]] +; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]] +; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]] @@ -93,8 +94,6 @@ ; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]] -; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} - ; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]] } @@ -170,13 +169,14 @@ ; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]] -; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]] +; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]] +; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]] @@ -187,7 +187,6 @@ ; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]] -; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]] } diff --git a/llvm/test/CodeGen/ARM/vector-load.ll b/llvm/test/CodeGen/ARM/vector-load.ll --- a/llvm/test/CodeGen/ARM/vector-load.ll +++ b/llvm/test/CodeGen/ARM/vector-load.ll @@ -253,9 +253,8 @@ } ; CHECK-LABEL: test_silly_load: -; CHECK: vldr d{{[0-9]+}}, [r0, #16] -; CHECK: movs r1, #24 -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]! +; CHECK: vld1.8 {d{{[0-9]+}}}, [r0:64]! ; CHECK: ldr {{r[0-9]+}}, [r0] define void @test_silly_load(<28 x i8>* %addr) { diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll --- a/llvm/test/CodeGen/ARM/vext.ll +++ b/llvm/test/CodeGen/ARM/vext.ll @@ -216,15 +216,14 @@ define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind { ; CHECK-LABEL: test_multisource: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d18, [r0, #32] -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: vorr d22, d18, d18 -; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]! -; CHECK-NEXT: vldr d19, [r0, #48] -; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] -; CHECK-NEXT: vzip.16 d22, d19 -; CHECK-NEXT: vtrn.16 q8, q10 -; CHECK-NEXT: vext.16 d18, d18, d22, #2 +; CHECK-NEXT: vld1.16 {d16, d17}, [r0:128]! +; CHECK-NEXT: vld1.16 {d18, d19}, [r0:128]! +; CHECK-NEXT: vld1.16 {d20, d21}, [r0:128]! 
+; CHECK-NEXT: vorr d23, d20, d20
+; CHECK-NEXT: vldr d22, [r0]
+; CHECK-NEXT: vzip.16 d23, d22
+; CHECK-NEXT: vtrn.16 q8, q9
+; CHECK-NEXT: vext.16 d18, d20, d23, #2
 ; CHECK-NEXT: vext.16 d16, d18, d16, #2
 ; CHECK-NEXT: vext.16 d16, d16, d16, #2
 ; CHECK-NEXT: vmov r0, r1, d16
diff --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll
--- a/llvm/test/CodeGen/ARM/vselect_imax.ll
+++ b/llvm/test/CodeGen/ARM/vselect_imax.ll
@@ -134,106 +134,97 @@
 %T1_19* %blend, %T0_19* %storeaddr) {
 ; CHECK-LABEL: func_blend19:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT: add r2, r1, #48
-; CHECK-NEXT: mov r8, #0
-; CHECK-NEXT: vld1.64 {d16, d17}, [r2:128]
-; CHECK-NEXT: add r2, r0, #48
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vld1.64 {d28, d29}, [r1:128]!
 ; CHECK-NEXT: mov lr, #0
-; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
-; CHECK-NEXT: vmov r2, r12, d16
-; CHECK-NEXT: vmov r6, r7, d17
-; CHECK-NEXT: vmov r4, r5, d18
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: sbcs r2, r5, r12
+; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]!
+; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]!
+; CHECK-NEXT: vld1.64 {d24, d25}, [r0:128]!
+; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]!
+; CHECK-NEXT: vld1.64 {d26, d27}, [r0:128]!
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT: vmov r0, r12, d16
+; CHECK-NEXT: vmov r1, r2, d18
+; CHECK-NEXT: subs r0, r1, r0
+; CHECK-NEXT: vmov r1, r4, d25
+; CHECK-NEXT: sbcs r0, r2, r12
 ; CHECK-NEXT: mov r12, #0
-; CHECK-NEXT: vmov r2, r4, d19
+; CHECK-NEXT: vmov r2, r0, d21
 ; CHECK-NEXT: movlt r12, #1
 ; CHECK-NEXT: cmp r12, #0
-; CHECK-NEXT: mov r5, r1
 ; CHECK-NEXT: mvnne r12, #0
-; CHECK-NEXT: vld1.64 {d24, d25}, [r5:128]!
-; CHECK-NEXT: vld1.64 {d20, d21}, [r5:128]
-; CHECK-NEXT: subs r2, r2, r6
-; CHECK-NEXT: mov r2, r0
-; CHECK-NEXT: add r0, r0, #32
-; CHECK-NEXT: vld1.64 {d26, d27}, [r2:128]!
-; CHECK-NEXT: vld1.64 {d22, d23}, [r2:128]
-; CHECK-NEXT: sbcs r2, r4, r7
-; CHECK-NEXT: vmov r4, r5, d21
-; CHECK-NEXT: movlt r8, #1
-; CHECK-NEXT: vmov r6, r7, d23
-; CHECK-NEXT: cmp r8, #0
-; CHECK-NEXT: mvnne r8, #0
-; CHECK-NEXT: vld1.64 {d28, d29}, [r0:128]
-; CHECK-NEXT: add r0, r1, #32
-; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]
-; CHECK-NEXT: vmov r0, r1, d20
-; CHECK-NEXT: vdup.32 d7, r8
-; CHECK-NEXT: vdup.32 d6, r12
-; CHECK-NEXT: subs r4, r6, r4
-; CHECK-NEXT: sbcs r4, r7, r5
-; CHECK-NEXT: vmov r5, r6, d24
-; CHECK-NEXT: vmov r7, r2, d26
-; CHECK-NEXT: mov r4, #0
-; CHECK-NEXT: movlt r4, #1
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: vdup.32 d5, r4
-; CHECK-NEXT: subs r5, r7, r5
-; CHECK-NEXT: sbcs r2, r2, r6
-; CHECK-NEXT: vmov r7, r6, d27
-; CHECK-NEXT: vmov r2, r9, d25
-; CHECK-NEXT: mov r5, #0
-; CHECK-NEXT: movlt r5, #1
-; CHECK-NEXT: cmp r5, #0
-; CHECK-NEXT: mvnne r5, #0
-; CHECK-NEXT: subs r2, r7, r2
-; CHECK-NEXT: sbcs r2, r6, r9
-; CHECK-NEXT: vmov r6, r7, d22
+; CHECK-NEXT: subs r1, r1, r2
+; CHECK-NEXT: sbcs r0, r4, r0
+; CHECK-NEXT: vmov r2, r4, d26
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d1, r0
+; CHECK-NEXT: vmov r0, r1, d22
+; CHECK-NEXT: subs r0, r2, r0
 ; CHECK-NEXT: mov r2, #0
+; CHECK-NEXT: sbcs r0, r4, r1
+; CHECK-NEXT: vmov r4, r5, d31
+; CHECK-NEXT: vmov r0, r1, d29
 ; CHECK-NEXT: movlt r2, #1
 ; CHECK-NEXT: cmp r2, #0
 ; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d1, r2
-; CHECK-NEXT: vdup.32 d0, r5
-; CHECK-NEXT: vbit q12, q13, q0
-; CHECK-NEXT: subs r0, r6, r0
-; CHECK-NEXT: vmov r2, r6, d28
-; CHECK-NEXT: sbcs r0, r7, r1
-; CHECK-NEXT: mov r7, #0
-; CHECK-NEXT: vmov r0, r1, d30
-; CHECK-NEXT: movlt r7, #1
-; CHECK-NEXT: subs r0, r2, r0
-; CHECK-NEXT: vmov r2, r5, d29
-; CHECK-NEXT: sbcs r0, r6, r1
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r1
+; CHECK-NEXT: vmov r4, r5, d30
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d3, r0
+; CHECK-NEXT: vmov r0, r1, d28
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r1
+; CHECK-NEXT: vmov r4, r5, d24
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d2, r0
+; CHECK-NEXT: vmov r0, r1, d20
+; CHECK-NEXT: vbit q14, q15, q1
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r1
+; CHECK-NEXT: vmov r1, r4, d17
+; CHECK-NEXT: vmov r5, r6, d19
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d0, r0
+; CHECK-NEXT: vbit q10, q12, q0
+; CHECK-NEXT: subs r1, r5, r1
+; CHECK-NEXT: sbcs r1, r6, r4
+; CHECK-NEXT: vmov r4, r5, d27
+; CHECK-NEXT: vmov r0, r1, d23
 ; CHECK-NEXT: mov r6, #0
-; CHECK-NEXT: vmov r0, r1, d31
 ; CHECK-NEXT: movlt r6, #1
-; CHECK-NEXT: subs r0, r2, r0
+; CHECK-NEXT: subs r0, r4, r0
 ; CHECK-NEXT: sbcs r0, r5, r1
 ; CHECK-NEXT: movlt lr, #1
 ; CHECK-NEXT: cmp lr, #0
 ; CHECK-NEXT: mvnne lr, #0
 ; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: vdup.32 d31, lr
 ; CHECK-NEXT: mvnne r6, #0
-; CHECK-NEXT: vdup.32 d3, lr
-; CHECK-NEXT: vdup.32 d2, r6
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: vorr q13, q1, q1
-; CHECK-NEXT: mvnne r7, #0
-; CHECK-NEXT: vdup.32 d4, r7
-; CHECK-NEXT: add r0, r3, #32
-; CHECK-NEXT: vbsl q13, q14, q15
-; CHECK-NEXT: vbit q10, q11, q2
-; CHECK-NEXT: vbit q8, q9, q3
-; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]
-; CHECK-NEXT: add r0, r3, #48
-; CHECK-NEXT: vst1.64 {d24, d25}, [r3:128]!
-; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
-; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: vdup.32 d30, r2
+; CHECK-NEXT: vdup.32 d3, r6
+; CHECK-NEXT: vbit q11, q13, q15
+; CHECK-NEXT: vdup.32 d2, r12
+; CHECK-NEXT: vst1.64 {d28, d29}, [r3:128]!
+; CHECK-NEXT: vbit q8, q9, q1
+; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r3:128]!
+; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]
+; CHECK-NEXT: pop {r4, r5, r6, lr}
 ; CHECK-NEXT: mov pc, lr
 %v0 = load %T0_19, %T0_19* %loadaddr
 %v1 = load %T0_19, %T0_19* %loadaddr2
@@ -251,213 +242,198 @@
 %T1_20* %blend, %T0_20* %storeaddr) {
 ; CHECK-LABEL: func_blend20:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, sp, #4
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, sp, #8
-; CHECK-NEXT: add r9, r1, #64
-; CHECK-NEXT: mov r2, #32
-; CHECK-NEXT: add r8, r0, #64
-; CHECK-NEXT: vld1.64 {d16, d17}, [r9:128], r2
-; CHECK-NEXT: mov r10, r1
-; CHECK-NEXT: mov r11, r0
-; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128], r2
-; CHECK-NEXT: vmov r7, r5, d17
-; CHECK-NEXT: vmov r6, r2, d19
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: vld1.64 {d22, d23}, [r10:128]!
-; CHECK-NEXT: subs r7, r6, r7
-; CHECK-NEXT: sbcs r2, r2, r5
-; CHECK-NEXT: vmov r5, r6, d16
-; CHECK-NEXT: vmov r7, r4, d18
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d21, r2
-; CHECK-NEXT: subs r5, r7, r5
-; CHECK-NEXT: sbcs r4, r4, r6
-; CHECK-NEXT: mov r4, #0
-; CHECK-NEXT: movlt r4, #1
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: vdup.32 d20, r4
-; CHECK-NEXT: vmov r2, r4, d23
-; CHECK-NEXT: vbit q8, q9, q10
-; CHECK-NEXT: vld1.64 {d18, d19}, [r11:128]!
-; CHECK-NEXT: vmov r7, r5, d19
-; CHECK-NEXT: subs r2, r7, r2
-; CHECK-NEXT: sbcs r2, r5, r4
-; CHECK-NEXT: vmov r5, r7, d18
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d21, r2
-; CHECK-NEXT: vmov r2, r4, d22
-; CHECK-NEXT: subs r2, r5, r2
-; CHECK-NEXT: sbcs r2, r7, r4
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d20, r2
-; CHECK-NEXT: add r2, r0, #48
-; CHECK-NEXT: vbif q9, q11, q10
-; CHECK-NEXT: vld1.64 {d30, d31}, [r2:128]
-; CHECK-NEXT: add r2, r1, #48
-; CHECK-NEXT: vld1.64 {d2, d3}, [r2:128]
-; CHECK-NEXT: vmov r5, r7, d30
-; CHECK-NEXT: vmov r2, r4, d2
-; CHECK-NEXT: vld1.64 {d26, d27}, [r11:128]
-; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]
-; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]!
-; CHECK-NEXT: vld1.64 {d22, d23}, [r9:128]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r8:128]!
-; CHECK-NEXT: vmov r11, r10, d21
-; CHECK-NEXT: subs r2, r5, r2
-; CHECK-NEXT: sbcs r2, r7, r4
-; CHECK-NEXT: vmov r7, r6, d31
-; CHECK-NEXT: vmov r2, r5, d3
-; CHECK-NEXT: mov r4, #0
-; CHECK-NEXT: movlt r4, #1
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: subs r2, r7, r2
-; CHECK-NEXT: mov r7, #0
-; CHECK-NEXT: sbcs r2, r6, r5
-; CHECK-NEXT: vmov r6, r5, d27
-; CHECK-NEXT: vmov r2, r9, d1
-; CHECK-NEXT: movlt r7, #1
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: mvnne r7, #0
-; CHECK-NEXT: vdup.32 d7, r7
-; CHECK-NEXT: vdup.32 d6, r4
-; CHECK-NEXT: subs r2, r6, r2
-; CHECK-NEXT: sbcs r2, r5, r9
-; CHECK-NEXT: vmov r6, r5, d26
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d5, r2
-; CHECK-NEXT: vmov r2, r9, d0
-; CHECK-NEXT: subs r2, r6, r2
-; CHECK-NEXT: sbcs r2, r5, r9
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d4, r2
-; CHECK-NEXT: add r2, r1, #32
-; CHECK-NEXT: vld1.64 {d28, d29}, [r2:128]
-; CHECK-NEXT: add r2, r0, #32
-; CHECK-NEXT: vbif q13, q0, q2
-; CHECK-NEXT: add r1, r1, #80
-; CHECK-NEXT: vld1.64 {d0, d1}, [r2:128]
-; CHECK-NEXT: vmov r4, r5, d28
-; CHECK-NEXT: vbif q15, q1, q3
-; CHECK-NEXT: add r0, r0, #80
-; CHECK-NEXT: vmov r2, r6, d0
-; CHECK-NEXT: vld1.64 {d2, d3}, [r8:128]
-; CHECK-NEXT: vmov r9, r8, d25
-; CHECK-NEXT: vld1.64 {d8, d9}, [r0:128]
-; CHECK-NEXT: vld1.64 {d6, d7}, [r1:128]
-; CHECK-NEXT: vmov r3, r12, d8
-; CHECK-NEXT: subs r2, r2, r4
-; CHECK-NEXT: sbcs r2, r6, r5
-; CHECK-NEXT: vmov r4, r5, d29
-; CHECK-NEXT: vmov r6, r7, d1
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: subs r4, r6, r4
-; CHECK-NEXT: sbcs r4, r7, r5
-; CHECK-NEXT: vmov r5, r6, d2
+; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: mov lr, r0
+; CHECK-NEXT: vld1.64 {d16, d17}, [r8:128]!
+; CHECK-NEXT: add r9, r0, #64
+; CHECK-NEXT: add r10, r1, #64
+; CHECK-NEXT: mov r12, #0
+; CHECK-NEXT: vld1.64 {d22, d23}, [lr:128]!
+; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128]!
+; CHECK-NEXT: vld1.64 {d20, d21}, [lr:128]!
+; CHECK-NEXT: vmov r6, r4, d19
+; CHECK-NEXT: vmov r5, r7, d21
+; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128]!
+; CHECK-NEXT: vld1.64 {d6, d7}, [r10:128]!
+; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]!
+; CHECK-NEXT: vld1.64 {d2, d3}, [r9:128]!
+; CHECK-NEXT: subs r6, r5, r6
+; CHECK-NEXT: sbcs r4, r7, r4
+; CHECK-NEXT: vmov r5, r6, d18
+; CHECK-NEXT: vmov r7, r2, d20
 ; CHECK-NEXT: mov r4, #0
 ; CHECK-NEXT: movlt r4, #1
 ; CHECK-NEXT: cmp r4, #0
 ; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: vdup.32 d5, r4
-; CHECK-NEXT: vdup.32 d4, r2
-; CHECK-NEXT: vmov r2, r4, d22
-; CHECK-NEXT: vbit q14, q0, q2
-; CHECK-NEXT: subs r2, r5, r2
-; CHECK-NEXT: sbcs r2, r6, r4
-; CHECK-NEXT: vmov r4, r5, d24
-; CHECK-NEXT: vmov r6, r7, d20
+; CHECK-NEXT: vdup.32 d31, r4
+; CHECK-NEXT: subs r5, r7, r5
+; CHECK-NEXT: sbcs r2, r2, r6
+; CHECK-NEXT: vmov r4, r5, d3
 ; CHECK-NEXT: mov r2, #0
 ; CHECK-NEXT: movlt r2, #1
 ; CHECK-NEXT: cmp r2, #0
 ; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: subs r1, r6, r4
-; CHECK-NEXT: vmov r0, r6, d9
-; CHECK-NEXT: sbcs r1, r7, r5
-; CHECK-NEXT: vmov r4, r5, d7
-; CHECK-NEXT: mov r1, #0
-; CHECK-NEXT: movlt r1, #1
-; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: mvnne r1, #0
-; CHECK-NEXT: subs r0, r0, r4
-; CHECK-NEXT: vmov r7, r4, d23
-; CHECK-NEXT: sbcs r0, r6, r5
-; CHECK-NEXT: vmov r5, lr, d6
+; CHECK-NEXT: vdup.32 d30, r2
+; CHECK-NEXT: vmov r0, r2, d1
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d2
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d9, r0
+; CHECK-NEXT: vmov r0, r2, d0
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d5
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d8, r0
+; CHECK-NEXT: vmov r0, r2, d7
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d4
 ; CHECK-NEXT: mov r0, #0
 ; CHECK-NEXT: movlt r0, #1
 ; CHECK-NEXT: cmp r0, #0
 ; CHECK-NEXT: mvnne r0, #0
 ; CHECK-NEXT: vdup.32 d11, r0
-; CHECK-NEXT: vmov r0, r6, d3
-; CHECK-NEXT: subs r0, r0, r7
-; CHECK-NEXT: sbcs r0, r6, r4
+; CHECK-NEXT: vmov r0, r2, d6
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d23
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d10, r0
+; CHECK-NEXT: vmov r0, r2, d17
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d22
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d25, r0
+; CHECK-NEXT: vmov r0, r2, d16
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
 ; CHECK-NEXT: mov r0, #0
 ; CHECK-NEXT: movlt r0, #1
-; CHECK-NEXT: subs r4, r11, r9
-; CHECK-NEXT: sbcs r4, r10, r8
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d24, r0
+; CHECK-NEXT: vorr q13, q12, q12
+; CHECK-NEXT: vbsl q13, q11, q8
+; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]!
+; CHECK-NEXT: vorr q8, q5, q5
+; CHECK-NEXT: vld1.64 {d28, d29}, [r10:128]!
+; CHECK-NEXT: vbsl q8, q2, q3
+; CHECK-NEXT: vld1.64 {d6, d7}, [r8:128]!
+; CHECK-NEXT: vld1.64 {d22, d23}, [r8:128]
+; CHECK-NEXT: vld1.64 {d4, d5}, [lr:128]!
+; CHECK-NEXT: vbif q10, q9, q15
+; CHECK-NEXT: vorr q9, q4, q4
+; CHECK-NEXT: vmov r0, r2, d22
+; CHECK-NEXT: vbsl q9, q1, q0
+; CHECK-NEXT: vld1.64 {d30, d31}, [lr:128]
+; CHECK-NEXT: mov lr, #0
+; CHECK-NEXT: vmov r7, r5, d30
+; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128]
+; CHECK-NEXT: vld1.64 {d2, d3}, [r10:128]
+; CHECK-NEXT: subs r0, r7, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r5, r4, d24
+; CHECK-NEXT: vmov r0, r7, d28
+; CHECK-NEXT: movlt lr, #1
+; CHECK-NEXT: cmp lr, #0
+; CHECK-NEXT: mvnne lr, #0
+; CHECK-NEXT: subs r0, r5, r0
+; CHECK-NEXT: sbcs r0, r4, r7
+; CHECK-NEXT: vmov r7, r5, d29
+; CHECK-NEXT: vmov r4, r6, d25
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: subs r7, r4, r7
 ; CHECK-NEXT: mov r4, #0
+; CHECK-NEXT: sbcs r7, r6, r5
+; CHECK-NEXT: vmov r5, r1, d31
+; CHECK-NEXT: vmov r7, r6, d23
 ; CHECK-NEXT: movlt r4, #1
-; CHECK-NEXT: subs r3, r3, r5
-; CHECK-NEXT: sbcs r3, r12, lr
-; CHECK-NEXT: mov r3, #0
-; CHECK-NEXT: movlt r3, #1
-; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: mvnne r3, #0
 ; CHECK-NEXT: cmp r4, #0
 ; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: vdup.32 d10, r3
-; CHECK-NEXT: vdup.32 d1, r4
-; CHECK-NEXT: vorr q2, q5, q5
-; CHECK-NEXT: vdup.32 d0, r1
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: vbsl q2, q4, q3
-; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: vbif q10, q12, q0
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vdup.32 d7, r0
-; CHECK-NEXT: add r0, r1, #80
-; CHECK-NEXT: vdup.32 d6, r2
-; CHECK-NEXT: vbit q11, q1, q3
-; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]
-; CHECK-NEXT: add r0, r1, #32
-; CHECK-NEXT: vst1.64 {d28, d29}, [r0:128]
-; CHECK-NEXT: add r0, r1, #48
-; CHECK-NEXT: vst1.64 {d30, d31}, [r0:128]
-; CHECK-NEXT: add r0, r1, #64
-; CHECK-NEXT: vst1.64 {d18, d19}, [r1:128]!
-; CHECK-NEXT: vst1.64 {d26, d27}, [r1:128]
-; CHECK-NEXT: mov r1, #32
-; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128], r1
+; CHECK-NEXT: subs r7, r5, r7
+; CHECK-NEXT: mov r5, #0
+; CHECK-NEXT: sbcs r1, r1, r6
+; CHECK-NEXT: vmov r6, r2, d5
+; CHECK-NEXT: vmov r1, r7, d7
+; CHECK-NEXT: movlt r5, #1
+; CHECK-NEXT: cmp r5, #0
+; CHECK-NEXT: mvnne r5, #0
+; CHECK-NEXT: subs r1, r6, r1
+; CHECK-NEXT: sbcs r1, r2, r7
+; CHECK-NEXT: vmov r6, r7, d4
+; CHECK-NEXT: mov r1, #0
+; CHECK-NEXT: movlt r1, #1
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: mvnne r1, #0
+; CHECK-NEXT: vdup.32 d9, r1
+; CHECK-NEXT: vmov r1, r2, d6
+; CHECK-NEXT: subs r1, r6, r1
+; CHECK-NEXT: sbcs r1, r7, r2
+; CHECK-NEXT: vmov r6, r7, d0
+; CHECK-NEXT: mov r1, #0
+; CHECK-NEXT: movlt r1, #1
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: mvnne r1, #0
+; CHECK-NEXT: vdup.32 d8, r1
+; CHECK-NEXT: vmov r1, r2, d2
+; CHECK-NEXT: vbif q2, q3, q4
+; CHECK-NEXT: vdup.32 d7, r5
+; CHECK-NEXT: vdup.32 d9, r4
+; CHECK-NEXT: vmov r4, r5, d1
+; CHECK-NEXT: vdup.32 d8, r0
+; CHECK-NEXT: mov r0, r3
+; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]!
+; CHECK-NEXT: vbif q12, q14, q4
+; CHECK-NEXT: vdup.32 d6, lr
+; CHECK-NEXT: vbit q11, q15, q3
 ; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]!
+; CHECK-NEXT: subs r1, r6, r1
+; CHECK-NEXT: mov r6, #0
+; CHECK-NEXT: sbcs r1, r7, r2
+; CHECK-NEXT: vmov r1, r2, d3
+; CHECK-NEXT: movlt r6, #1
+; CHECK-NEXT: subs r1, r4, r1
+; CHECK-NEXT: sbcs r1, r5, r2
+; CHECK-NEXT: movlt r12, #1
+; CHECK-NEXT: cmp r12, #0
+; CHECK-NEXT: mvnne r12, #0
+; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: vdup.32 d27, r12
+; CHECK-NEXT: mvnne r6, #0
+; CHECK-NEXT: vdup.32 d26, r6
+; CHECK-NEXT: vorr q10, q13, q13
+; CHECK-NEXT: vbsl q10, q0, q1
+; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]!
 ; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128]
-; CHECK-NEXT: add sp, sp, #8
+; CHECK-NEXT: add r0, r3, #64
+; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]!
+; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]
 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: add sp, sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT: mov pc, lr
 %v0 = load %T0_20, %T0_20* %loadaddr
 %v1 = load %T0_20, %T0_20* %loadaddr2
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -198,21 +198,13 @@

 ; @testNeon is an important example of the nead for ivchains.
 ;
-; Currently we have two extra add.w's that keep the store address
-; live past the next increment because ISEL is unfortunately undoing
-; the store chain. ISEL also fails to convert all but one of the stores to
-; post-increment addressing. However, the loads should use
-; post-increment addressing, no add's or add.w's beyond the three
-; mentioned. Most importantly, there should be no spills or reloads!
+; Loads and stores should use post-increment addressing, no add's or add.w's.
+; Most importantly, there should be no spills or reloads!
 ;
 ; A9: testNeon:
 ; A9: %.lr.ph
-; A9: add.w r
 ; A9-NOT: lsl.w
 ; A9-NOT: {{ldr|str|adds|add r}}
-; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}}
-; A9: add.w r
-; A9-NOT: {{ldr|str|adds|add r}}
 ; A9-NOT: add.w r
 ; A9: bne
 define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {