Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -10993,6 +10993,125 @@
                              DAG.getUNDEF(VT), NewMask);
 }
 
+static bool isUpdatingVLDorVST1(SDNode *Inst) {
+  switch (Inst->getOpcode()) {
+  case ARMISD::VLD1_UPD:
+  case ARMISD::VST1_UPD:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static SDValue getIncrementWithOffset(SelectionDAG &DAG, SDValue C,
+                                      int64_t Offset, SDLoc DL) {
+  // If Offset is zero then C may or may not be constant.
+  if (!Offset)
+    return C;
+
+  // We should always have a constant value C if the offset is not zero.
+  int64_t NewVal = cast<ConstantSDNode>(C.getNode())->getSExtValue() - Offset;
+
+  return DAG.getConstant(NewVal, DL, C.getValueType());
+}
+
+// Given the load/store access size, the address operand (Addr) and the
+// address increment instruction (Inst), check if we can fold the load/store
+// with Inst, and if so return the increment operand and its value (0 if it
+// is not constant).
+static std::pair<SDValue, int64_t> getIncrementIfCanFold(SDValue Addr,
+                                                         SDNode *Inst,
+                                                         unsigned AccessSize,
+                                                         int64_t Offset) {
+  // If the increment is a constant, it must match the memory ref size.
+  SDValue Inc = Inst->getOperand(Inst->getOperand(0) == Addr ? 1 : 0);
+  auto *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+
+  // Don't select a non-constant increment if we have to subtract a
+  // constant from it. This may result in additional register pressure.
+  if (!CInc && Offset)
+    return {SDValue(), 0};
+
+  int64_t CIncSize = CInc ? CInc->getSExtValue() : 0;
+  if (AccessSize >= 3 * 16 && CIncSize != AccessSize) {
+    // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+    // separate instructions that make it harder to use a non-constant update.
+    return {SDValue(), 0};
+  }
+
+  // If the increment is not greater than the offset introduced by VLD/VST
+  // nodes earlier in the chain, we won't be able to fold it.
+  if (CInc && CIncSize <= Offset)
+    return {SDValue(), 0};
+
+  return {Inc, CIncSize};
+}
+
+// Find an address-updating instruction that we can fold with the load/store,
+// creating a VLD{X}_UPD or VST{X}_UPD.
+static std::pair<SDNode *, SDValue>
+findAddressUpdateToFold(SelectionDAG &DAG, SDNode *N, SDValue Addr,
+                        unsigned AccessSize) {
+  int64_t Offset = 0;
+  SDLoc DL(N);
+  struct Match {
+    SDNode *UInst; // Address update instruction
+    SDValue Inc;   // Address increment
+    int64_t Off;   // Offset introduced by cascaded vld/vst
+  } M = {};
+
+  while (true) {
+    SDNode *AddrOpInst = Addr.getNode();
+    for (SDNode::use_iterator UI = AddrOpInst->use_begin(),
+                              UE = AddrOpInst->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User->getOpcode() != ISD::ADD ||
+          UI.getUse().getResNo() != Addr.getResNo())
+        continue;
+
+      // Check that the add is independent of the load/store. Otherwise,
+      // folding it would create a cycle.
+      if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+        continue;
+
+      // We can fold the following kinds of address increment:
+      // 1. Non-constant, when Offset == 0
+      // 2. Constant, when Inc.second >= Offset
+      auto Inc = getIncrementIfCanFold(Addr, User, AccessSize, Offset);
+
+      // Remember the first foldable increment; if we've already found one,
+      // don't overwrite it.
+      if (!M.UInst && Inc.first.getNode())
+        M = {User, Inc.first, Offset};
+
+      // Offset can be negative, so we can occasionally select a
+      // non-constant increment if (AccessSize + Offset) == 0.
+      if (Inc.second && (Inc.second == AccessSize + Offset))
+        // We've found the best match possible.
+        return {User, getIncrementWithOffset(DAG, Inc.first, Offset, DL)};
+    }
+
+    // If 'Addr' points to a VLD1_UPD or VST1_UPD with a fixed post-increment,
+    // then we examine the parent address operand as well, keeping track of
+    // the post-increment value.
+    if (!isUpdatingVLDorVST1(AddrOpInst))
+      break;
+
+    // Get the post-increment value from the VST1_UPD or VLD1_UPD. If it is
+    // not constant, don't bother: otherwise we'd introduce an extra register
+    // operation, because we'd need to subtract the constant value from the
+    // register increment.
+    auto *CInc = dyn_cast<ConstantSDNode>(AddrOpInst->getOperand(2).getNode());
+    if (!CInc)
+      break;
+
+    // Update the offset with the post-increment size of the instruction
+    // earlier in the chain.
+    Offset += CInc->getSExtValue();
+    Addr = Addr.getOperand(1);
+  }
+  return {M.UInst, getIncrementWithOffset(DAG, M.Inc, M.Off, DL)};
+}
+
 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
 /// NEON load/store intrinsics, and generic vector load/stores, to merge
 /// base address updates.
@@ -11009,195 +11128,174 @@
   MemSDNode *MemN = cast<MemSDNode>(N);
   SDLoc dl(N);
 
-  // Search for a use of the address operand that is an increment.
-  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
-       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
-    SDNode *User = *UI;
-    if (User->getOpcode() != ISD::ADD ||
-        UI.getUse().getResNo() != Addr.getResNo())
-      continue;
-
-    // Check that the add is independent of the load/store. Otherwise, folding
-    // it would create a cycle.
-    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
-      continue;
-
-    // Find the new opcode for the updating load/store.
-    bool isLoadOp = true;
-    bool isLaneOp = false;
-    unsigned NewOpc = 0;
-    unsigned NumVecs = 0;
-    if (isIntrinsic) {
-      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-      switch (IntNo) {
-      default: llvm_unreachable("unexpected intrinsic for Neon base update");
-      case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
-        NumVecs = 1; break;
-      case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
-        NumVecs = 2; break;
-      case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
-        NumVecs = 3; break;
-      case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
-        NumVecs = 4; break;
-      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
-        NumVecs = 2; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
-        NumVecs = 3; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
-        NumVecs = 4; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
-        NumVecs = 1; isLoadOp = false; break;
-      case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
-        NumVecs = 2; isLoadOp = false; break;
-      case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
-        NumVecs = 3; isLoadOp = false; break;
-      case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
-        NumVecs = 4; isLoadOp = false; break;
-      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
-        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
-        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
-        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
-      }
-    } else {
-      isLaneOp = true;
-      switch (N->getOpcode()) {
-      default: llvm_unreachable("unexpected opcode for Neon base update");
-      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
-      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
-      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
-      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
-      case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
-        NumVecs = 1; isLaneOp = false; break;
-      case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
-        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
-      }
-    }
-
-    // Find the size of memory referenced by the load/store.
-    EVT VecTy;
-    if (isLoadOp) {
-      VecTy = N->getValueType(0);
-    } else if (isIntrinsic) {
-      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
-    } else {
-      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
-      VecTy = N->getOperand(1).getValueType();
+  // Find the new opcode for the updating load/store.
+  bool isLoadOp = true;
+  bool isLaneOp = false;
+  unsigned NewOpc = 0;
+  unsigned NumVecs = 0;
+  if (isIntrinsic) {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
+      NumVecs = 1; break;
+    case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
+      NumVecs = 2; break;
+    case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
+      NumVecs = 3; break;
+    case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
+      NumVecs = 4; break;
+    case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+      NumVecs = 2; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+      NumVecs = 3; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+      NumVecs = 4; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
+      NumVecs = 1; isLoadOp = false; break;
+    case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
+      NumVecs = 2; isLoadOp = false; break;
+    case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
+      NumVecs = 3; isLoadOp = false; break;
+    case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
+      NumVecs = 4; isLoadOp = false; break;
+    case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+      NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+      NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
+    case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+      NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
     }
-
-    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
-    if (isLaneOp)
-      NumBytes /= VecTy.getVectorNumElements();
-
-    // If the increment is a constant, it must match the memory ref size.
-    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
-    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
-    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
-      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
-      // separate instructions that make it harder to use a non-constant update.
-      continue;
+  } else {
+    isLaneOp = true;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("unexpected opcode for Neon base update");
+    case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
+    case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+    case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+    case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+    case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
+      NumVecs = 1; isLaneOp = false; break;
+    case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
+      NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
     }
+  }
 
-    // OK, we found an ADD we can fold into the base update.
-    // Now, create a _UPD node, taking care of not breaking alignment.
-
-    EVT AlignedVecTy = VecTy;
-    unsigned Alignment = MemN->getAlignment();
+  // Find the size of memory referenced by the load/store.
+  EVT VecTy;
+  if (isLoadOp) {
+    VecTy = N->getValueType(0);
+  } else if (isIntrinsic) {
+    VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+  } else {
+    assert(isStore && "Node has to be a load, a store, or an intrinsic!");
+    VecTy = N->getOperand(1).getValueType();
+  }
 
-    // If this is a less-than-standard-aligned load/store, change the type to
-    // match the standard alignment.
-    // The alignment is overlooked when selecting _UPD variants; and it's
-    // easier to introduce bitcasts here than fix that.
-    // There are 3 ways to get to this base-update combine:
-    // - intrinsics: they are assumed to be properly aligned (to the standard
-    //   alignment of the memory type), so we don't need to do anything.
-    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
-    //   intrinsics, so, likewise, there's nothing to do.
-    // - generic load/store instructions: the alignment is specified as an
-    //   explicit operand, rather than implicitly as the standard alignment
-    //   of the memory type (like the intrisics). We need to change the
-    //   memory type to match the explicit alignment. That way, we don't
-    //   generate non-standard-aligned ARMISD::VLDx nodes.
-    if (isa<LSBaseSDNode>(N)) {
-      if (Alignment == 0)
-        Alignment = 1;
-      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
-        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
-        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
-        assert(!isLaneOp && "Unexpected generic load/store lane.");
-        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
-        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
-      }
-      // Don't set an explicit alignment on regular load/stores that we want
-      // to transform to VLD/VST 1_UPD nodes.
-      // This matches the behavior of regular load/stores, which only get an
-      // explicit alignment if the MMO alignment is larger than the standard
-      // alignment of the memory type.
-      // Intrinsics, however, always get an explicit alignment, set to the
-      // alignment of the MMO.
+  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+  if (isLaneOp)
+    NumBytes /= VecTy.getVectorNumElements();
+
+  auto AU = findAddressUpdateToFold(DAG, N, Addr, NumBytes);
+  if (!AU.first)
+    return SDValue();
+
+  // OK, we found an ADD we can fold into the base update.
+  // Now, create a _UPD node, taking care of not breaking alignment.
+  EVT AlignedVecTy = VecTy;
+  unsigned Alignment = MemN->getAlignment();
+
+  // If this is a less-than-standard-aligned load/store, change the type to
+  // match the standard alignment.
+  // The alignment is overlooked when selecting _UPD variants; and it's
+  // easier to introduce bitcasts here than fix that.
+  // There are 3 ways to get to this base-update combine:
+  // - intrinsics: they are assumed to be properly aligned (to the standard
+  //   alignment of the memory type), so we don't need to do anything.
+  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+  //   intrinsics, so, likewise, there's nothing to do.
+  // - generic load/store instructions: the alignment is specified as an
+  //   explicit operand, rather than implicitly as the standard alignment
+  //   of the memory type (like the intrinsics). We need to change the
+  //   memory type to match the explicit alignment. That way, we don't
+  //   generate non-standard-aligned ARMISD::VLDx nodes.
+  if (isa<LSBaseSDNode>(N)) {
+    if (Alignment == 0)
       Alignment = 1;
-
-    // Create the new updating load/store node.
-    // First, create an SDVTList for the new updating node's results.
-    EVT Tys[6];
-    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
-    unsigned n;
-    for (n = 0; n < NumResultVecs; ++n)
-      Tys[n] = AlignedVecTy;
-    Tys[n++] = MVT::i32;
-    Tys[n] = MVT::Other;
-    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
-
-    // Then, gather the new node's operands.
-    SmallVector<SDValue, 8> Ops;
-    Ops.push_back(N->getOperand(0)); // incoming chain
-    Ops.push_back(N->getOperand(AddrOpIdx));
-    Ops.push_back(Inc);
-
-    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
-      // Try to match the intrinsic's signature
-      Ops.push_back(StN->getValue());
-    } else {
-      // Loads (and of course intrinsics) match the intrinsics' signature,
-      // so just add all but the alignment operand.
-      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
-        Ops.push_back(N->getOperand(i));
-    }
+    if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+      MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+      assert(!isLaneOp && "Unexpected generic load/store lane.");
+      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+    }
+    // Don't set an explicit alignment on regular load/stores that we want
+    // to transform to VLD/VST 1_UPD nodes.
+    // This matches the behavior of regular load/stores, which only get an
+    // explicit alignment if the MMO alignment is larger than the standard
+    // alignment of the memory type.
+    // Intrinsics, however, always get an explicit alignment, set to the
+    // alignment of the MMO.
+    Alignment = 1;
+  }
+
+  // Create the new updating load/store node.
+  // First, create an SDVTList for the new updating node's results.
+  EVT Tys[6];
+  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+  unsigned n;
+  for (n = 0; n < NumResultVecs; ++n)
+    Tys[n] = AlignedVecTy;
+  Tys[n++] = MVT::i32;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
 
-    // For all node types, the alignment operand is always the last one.
-    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+  // Then, gather the new node's operands.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(N->getOperand(0)); // incoming chain
+  Ops.push_back(N->getOperand(AddrOpIdx));
+  Ops.push_back(AU.second);
+
+  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+    // Try to match the intrinsic's signature
+    Ops.push_back(StN->getValue());
+  } else {
+    // Loads (and of course intrinsics) match the intrinsics' signature,
+    // so just add all but the alignment operand.
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
+      Ops.push_back(N->getOperand(i));
+  }
 
-    // If this is a non-standard-aligned STORE, the penultimate operand is the
-    // stored value. Bitcast it to the aligned type.
-    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
-      SDValue &StVal = Ops[Ops.size()-2];
-      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
-    }
+  // For all node types, the alignment operand is always the last one.
+  Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
 
-    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
-    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
-                                           MemN->getMemOperand());
+  // If this is a non-standard-aligned STORE, the penultimate operand is the
+  // stored value. Bitcast it to the aligned type.
+  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+    SDValue &StVal = Ops[Ops.size()-2];
+    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+  }
 
-    // Update the uses.
-    SmallVector<SDValue, 5> NewResults;
-    for (unsigned i = 0; i < NumResultVecs; ++i)
-      NewResults.push_back(SDValue(UpdN.getNode(), i));
+  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
+                                         MemN->getMemOperand());
 
-    // If this is an non-standard-aligned LOAD, the first result is the loaded
-    // value. Bitcast it to the expected result type.
-    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
-      SDValue &LdVal = NewResults[0];
-      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
-    }
+  // Update the uses.
+  SmallVector<SDValue, 5> NewResults;
+  for (unsigned i = 0; i < NumResultVecs; ++i)
+    NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+  // If this is a non-standard-aligned LOAD, the first result is the loaded
+  // value. Bitcast it to the expected result type.
+  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+    SDValue &LdVal = NewResults[0];
+    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+  }
 
-    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
-    DCI.CombineTo(N, NewResults);
-    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+  DCI.CombineTo(N, NewResults);
+  DCI.CombineTo(AU.first, SDValue(UpdN.getNode(), NumResultVecs));
 
-    break;
-  }
   return SDValue();
 }
Index: test/CodeGen/ARM/alloc-no-stack-realign.ll
===================================================================
--- test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -8,31 +8,26 @@
 define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" {
 entry:
 ; CHECK-LABEL: test1:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], #32
-; CHECK: mov r[[R2:[0-9]+]], sp
-; CHECK: mov r[[R3:[0-9]+]], r[[R2]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128], r[[R1]]
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: mov r[[R1]], sp
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
+; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
+; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: add sp, sp, #64
+; CHECK: bx lr
 %retval = alloca <16 x float>, align 16
 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
 store <16 x float> %0, <16 x float>* %retval
@@ -44,32 +39,26 @@
 define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
 entry:
 ; CHECK-LABEL: test2:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], #32
-; CHECK: mov r[[R2:[0-9]+]], sp
-; CHECK: mov r[[R3:[0-9]+]], r[[R2]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128], r[[R1]]
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
-
+; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: mov r[[R1]], sp
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
+; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
+; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
+; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: mov sp, r7
+; CHECK: pop {r7, pc}
 %retval = alloca <16 x float>, align 16
 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
 store <16 x float> %0, <16 x float>* %retval
Index: test/CodeGen/ARM/cascade-vld-vst.ll
===================================================================
--- test/CodeGen/ARM/cascade-vld-vst.ll
+++ test/CodeGen/ARM/cascade-vld-vst.ll
@@ -0,0 +1,69 @@
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
+
+%M = type { [4 x <4 x float>] }
+
+; Function Attrs: noimplicitfloat noinline norecurse nounwind uwtable
+define void @_test_vld1_vst1(%M* %A, %M *%B) {
+entry:
+  %v0p = getelementptr inbounds %M, %M* %A, i32 0, i32 0, i32 0
+  %v0 = load <4 x float>, <4 x float>* %v0p
+  %v1p = getelementptr inbounds %M, %M* %A, i32 0, i32 0, i32 1
+  %v1 = load <4 x float>, <4 x float>* %v1p
+  %v2p = getelementptr inbounds %M, %M* %A, i32 0, i32 0, i32 2
+  %v2 = load <4 x float>, <4 x float>* %v2p
+  %v3p = getelementptr inbounds %M, %M* %A, i32 0, i32 0, i32 3
+  %v3 = load <4 x float>, <4 x float>* %v3p
+
+  %s0p = getelementptr inbounds %M, %M* %B, i32 0, i32 0, i32 0
+  store <4 x float> %v0, <4 x float>* %s0p
+  %s1p = getelementptr inbounds %M, %M* %B, i32 0, i32 0, i32 1
+  store <4 x float> %v1, <4 x float>* %s1p
+  %s2p = getelementptr inbounds %M, %M* %B, i32 0, i32 0, i32 2
+  store <4 x float> %v2, <4 x float>* %s2p
+  %s3p = getelementptr inbounds %M, %M* %B, i32 0, i32 0, i32 3
+  store <4 x float> %v3, <4 x float>* %s3p
+  ret void
+}
+
+; CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
+; CHECK-NEXT: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK-NEXT: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK-NEXT: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK-NEXT: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK-NEXT: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK-NEXT: mov pc, lr
+
+; This function compiles into a DAG with a load instruction that has a
+; negative address increment. We can't optimize this yet; just test that we
+; don't crash and generate something meaningful.
+define void @load_negative(<4 x i32>* %data, i32* %nn, i32 %ndim) {
+entry:
+  %n.vec = and i32 %ndim, -8
+  br label %loop
+
+loop:
+  %index = phi i32 [ 0, %entry ], [ %index.next, %loop ]
+  %vec.phi = phi <4 x i32> [ , %entry ], [ %4, %loop ]
+  %offset.idx = or i32 %index, 1
+  %0 = getelementptr inbounds i32, i32* %nn, i32 %offset.idx
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %l1 = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = getelementptr i32, i32* %0, i32 4
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %l2 = load <4 x i32>, <4 x i32>* %3, align 4
+  %4 = mul nsw <4 x i32> %l1, %vec.phi
+  %5 = mul nsw <4 x i32> %l2, %vec.phi
+  %index.next = add i32 %index, 8
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %res, label %loop
+
+res:
+  %r = mul nsw <4 x i32> %4, %5
+  store <4 x i32> %r, <4 x i32>* %data
+  ret void
+}
+
+; CHECK: sub r[[RD:[0-9]+]], {{r[0-9]+}}, #16
+; CHECK: vld1.{{.*}} {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[RD]]]
Index: test/CodeGen/ARM/memcpy-inline.ll
===================================================================
--- test/CodeGen/ARM/memcpy-inline.ll
+++ test/CodeGen/ARM/memcpy-inline.ll
@@ -44,15 +44,14 @@
 define void @t2(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t2:
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
-; CHECK: movs [[INC:r[0-9]+]], #32
-; CHECK: add.w r3, r0, #16
-; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
-; CHECK: movw [[REG2:r[0-9]+]], #16716
-; CHECK: movt [[REG2:r[0-9]+]], #72
-; CHECK: str [[REG2]], [r0]
-; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
-; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3]
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
+; CHECK: movw r1, #16716
+; CHECK: movt r1, #72
+; CHECK: str r1, [r0]
+; CHECK: bx lr
 ; CHECK-T1-LABEL: t2:
 ; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
Index: test/CodeGen/ARM/misched-fusion-aes.ll
===================================================================
--- test/CodeGen/ARM/misched-fusion-aes.ll
+++ test/CodeGen/ARM/misched-fusion-aes.ll
@@ -74,19 +74,19 @@
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QA]]
 ; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]]
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QF]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]]
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]]
 }
@@ -160,19 +160,19 @@
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QA]]
 ; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]]
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]]
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QF]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]]
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]]
 }
Index: test/CodeGen/ARM/vector-load.ll
===================================================================
--- test/CodeGen/ARM/vector-load.ll
+++ test/CodeGen/ARM/vector-load.ll
@@ -253,10 +253,10 @@
 }
 ; CHECK-LABEL: test_silly_load:
-; CHECK: vldr d{{[0-9]+}}, [r0, #16]
-; CHECK: movs r1, #24
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1
-; CHECK: ldr {{r[0-9]+}}, [r0]
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]!
+; CHECK-NEXT: vld1.8 {d{{[0-9]+}}}, [r0:64]!
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: bx lr
 define void @test_silly_load(<28 x i8>* %addr) {
   load volatile <28 x i8>, <28 x i8>* %addr
Index: test/CodeGen/ARM/vext.ll
===================================================================
--- test/CodeGen/ARM/vext.ll
+++ test/CodeGen/ARM/vext.ll
@@ -216,21 +216,18 @@
 define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
 ; CHECK-LABEL: test_multisource:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: add r2, r0, #48
-; CHECK-NEXT: add r0, r0, #32
-; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128]
-; CHECK-NEXT: vorr d24, d20, d20
-; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
-; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]
-; CHECK-NEXT: vzip.16 d24, d18
-; CHECK-NEXT: vtrn.16 q8, q11
-; CHECK-NEXT: vext.16 d18, d20, d24, #2
-; CHECK-NEXT: vext.16 d16, d18, d16, #2
-; CHECK-NEXT: vext.16 d16, d16, d16, #2
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vld1.16 {d16, d17}, [r0:128]!
+; CHECK-NEXT: vld1.16 {d18, d19}, [r0:128]!
+; CHECK-NEXT: vld1.16 {d20, d21}, [r0:128]!
+; CHECK-NEXT: vorr d24, d20, d20
+; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128]
+; CHECK-NEXT: vzip.16 d24, d22
+; CHECK-NEXT: vtrn.16 q8, q9
+; CHECK-NEXT: vext.16 d18, d20, d24, #2
+; CHECK-NEXT: vext.16 d16, d18, d16, #2
+; CHECK-NEXT: vext.16 d16, d16, d16, #2
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
   %tmp1 = load <32 x i16>, <32 x i16>* %B
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32>
   ret <4 x i16> %tmp2
Index: test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -198,20 +198,13 @@
 ; @testNeon is an important example of the nead for ivchains.
 ;
-; Currently we have two extra add.w's that keep the store address
-; live past the next increment because ISEL is unfortunately undoing
-; the store chain. ISEL also fails to convert all but one of the stores to
-; post-increment addressing. However, the loads should use
-; post-increment addressing, no add's or add.w's beyond the three
-; mentioned. Most importantly, there should be no spills or reloads!
-;
 ; A9: testNeon:
 ; A9: %.lr.ph
-; A9: add.w r
+; A9-NOT: add.w r
 ; A9-NOT: lsl.w
 ; A9-NOT: {{ldr|str|adds|add r}}
-; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}}
-; A9: add.w r
+; A9: vst1.8 {{.*}} [r{{[0-9]+}}]!
+; A9-NOT: add.w r
 ; A9-NOT: {{ldr|str|adds|add r}}
 ; A9-NOT: add.w r
 ; A9: bne