Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp =================================================================== --- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -235,10 +235,6 @@ } } -static bool isNarrowStore(MachineInstr *MI) { - return isNarrowStore(MI->getOpcode()); -} - static bool isNarrowLoad(unsigned Opc) { switch (Opc) { default: @@ -386,6 +382,10 @@ return AArch64::STURHHi; case AArch64::STURHHi: return AArch64::STURWi; + case AArch64::STURWi: + return AArch64::STURXi; + case AArch64::STRWui: + return AArch64::STRXui; case AArch64::LDRHHui: case AArch64::LDRSHWui: return AArch64::LDRWui; @@ -640,6 +640,16 @@ (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } +static bool isPromotableZeroStoreOpcode(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi; +} + +static bool isPromotableZeroStoreInst(MachineInstr *MI) { + return (isPromotableZeroStoreOpcode(MI)) && + getLdStRegOp(MI).getReg() == AArch64::WZR; +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator MergeMI, @@ -775,12 +785,12 @@ MergeMI->eraseFromParent(); return NextI; } - assert(isNarrowStore(Opc) && "Expected narrow store"); + assert(isPromotableZeroStoreInst(I) && "Expected promotable zero store"); // Construct the new instruction. MachineInstrBuilder MIB; MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) - .addOperand(getLdStRegOp(I)) + .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) .addOperand(BaseRegOp) .addImm(OffsetImm) .setMemRefs(I->mergeMemRefsWith(*MergeMI)); @@ -1211,7 +1221,7 @@ unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; - bool IsNarrowStore = isNarrowStore(Opc); + bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -1282,7 +1292,7 @@ continue; } - if (IsNarrowLoad || IsNarrowStore) { + if (IsNarrowLoad || IsPromotableZeroStore) { // If the alignment requirements of the scaled wide load/store // instruction can't express the offset of the scaled narrow // input, bail and keep looking. @@ -1307,7 +1317,7 @@ // For narrow stores, allow only when the stored value is the same // (i.e., WZR). if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || - (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { + (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(MI); continue; @@ -1633,24 +1643,27 @@ // store. bool AArch64LoadStoreOpt::tryToMergeLdStInst( MachineBasicBlock::iterator &MBBI) { - assert((isNarrowLoad(MBBI) || isNarrowStore(MBBI)) && "Expected narrow op."); + assert((isNarrowLoad(MBBI) || isPromotableZeroStoreOpcode(MBBI)) && + "Expected narrow op."); MachineInstr *MI = MBBI; MachineBasicBlock::iterator E = MI->getParent()->end(); if (!isCandidateToMergeOrPair(MI)) return false; - // For narrow stores, find only the case where the stored value is WZR. - if (isNarrowStore(MI) && getLdStRegOp(MI).getReg() != AArch64::WZR) + // For promotable zero stores, the stored value should be WZR. + if (isPromotableZeroStoreOpcode(MI) && + getLdStRegOp(MI).getReg() != AArch64::WZR) return false; // Look ahead up to LdStLimit instructions for a mergable instruction. LdStPairFlags Flags; - MachineBasicBlock::iterator MergeMI = findMatchingInsn(MBBI, Flags, LdStLimit); + MachineBasicBlock::iterator MergeMI = + findMatchingInsn(MBBI, Flags, LdStLimit); if (MergeMI != E) { if (isNarrowLoad(MI)) { ++NumNarrowLoadsPromoted; - } else if (isNarrowStore(MI)) { + } else if (isPromotableZeroStoreInst(MI)) { ++NumZeroStoresPromoted; } // Keeping the iterator straight is a pain, so we let the merge routine tell @@ -1765,13 +1778,15 @@ case AArch64::LDRSHWui: case AArch64::STRBBui: case AArch64::STRHHui: + case AArch64::STRWui: // Unscaled instructions. case AArch64::LDURBBi: case AArch64::LDURHHi: case AArch64::LDURSBWi: case AArch64::LDURSHWi: case AArch64::STURBBi: - case AArch64::STURHHi: { + case AArch64::STURHHi: + case AArch64::STURWi: { if (tryToMergeLdStInst(MBBI)) { Modified = true; break; Index: test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll =================================================================== --- test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll +++ test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll @@ -352,6 +352,42 @@ ret void } +;CHECK-LABEL: Strw_zero +;CHECK : str xzr +define void @Strw_zero(i32* nocapture %P, i32 %n) { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom + store i32 0, i32* %arrayidx + %add = add nsw i32 %n, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1 + store i32 0, i32* %arrayidx2 + ret void +} + +;CHECK-LABEL: Strw_zero_4 +;CHECK : stp xzr +define void @Strw_zero_4(i32* nocapture %P, i32 %n) { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom + store i32 0, i32* %arrayidx + %add = add nsw i32 %n, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1 + store i32 0, i32* %arrayidx2 + %add3 = add nsw i32 %n, 2 + %idxprom4 = sext i32 %add3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %P, i64 %idxprom4 + store i32 0, i32* %arrayidx5 + %add6 = add nsw i32 %n, 3 + %idxprom7 = sext i32 %add6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32* %P, i64 %idxprom7 + store i32 0, i32* %arrayidx8 + ret void +} + ; CHECK-LABEL: Sturb_zero ; CHECK: sturh wzr define void @Sturb_zero(i8* nocapture %P, i32 %n) #0 { @@ -404,3 +440,42 @@ store i16 0, i16* %arrayidx9 ret void } + +;CHECK-LABEL: Sturw_zero +;CHECK : stur xzr +define void @Sturw_zero(i32* nocapture %P, i32 %n) { +entry: + %sub = add nsw i32 %n, -3 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom + store i32 0, i32* %arrayidx + %sub1 = add nsw i32 %n, -4 + %idxprom2 = sext i32 %sub1 to i64 + %arrayidx3 = getelementptr inbounds i32, i32* %P, i64 %idxprom2 + store i32 0, i32* %arrayidx3 + ret void +} + +;CHECK-LABEL: Sturw_zero_4 +;CHECK : str xzr +define void @Sturw_zero_4(i32* nocapture %P, i32 %n) { +entry: + %sub = add nsw i32 %n, -3 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom + store i32 0, i32* %arrayidx + %sub1 = add nsw i32 %n, -4 + %idxprom2 = sext i32 %sub1 to i64 + %arrayidx3 = getelementptr inbounds i32, i32* %P, i64 %idxprom2 + store i32 0, i32* %arrayidx3 + %sub4 = add nsw i32 %n, -2 + %idxprom5 = sext i32 %sub4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %P, i64 %idxprom5 + store i32 0, i32* %arrayidx6 + %sub7 = add nsw i32 %n, -1 + %idxprom8 = sext i32 %sub7 to i64 + %arrayidx9 = getelementptr inbounds i32, i32* %P, i64 %idxprom8 + store i32 0, i32* %arrayidx9 + ret void +} +