Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -375,6 +375,10 @@
     return AArch64::STURHHi;
   case AArch64::STURHHi:
     return AArch64::STURWi;
+  case AArch64::STURWi:
+    return AArch64::STURXi;
+  case AArch64::STRWui:
+    return AArch64::STRXui;
   case AArch64::LDRHHui:
   case AArch64::LDRSHWui:
     return AArch64::LDRWui;
@@ -629,6 +633,17 @@
          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
 }
 
+// Returns true if MI is a narrow store or a 32-bit store (STURWi/STRWui)
+// whose stored register is WZR, i.e. a zero store that may be promoted to
+// the wider opcode given by getMatchingWideOpcode().
+static bool isPromotableZeroStore(MachineInstr *MI) {
+  const MachineOperand &MO = getLdStRegOp(MI);
+  unsigned Opc = MI->getOpcode();
+  return ((isNarrowStore(Opc) || Opc == AArch64::STURWi ||
+           Opc == AArch64::STRWui) &&
+          MO.isReg() && MO.getReg() == AArch64::WZR);
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                       MachineBasicBlock::iterator Paired,
@@ -774,7 +789,7 @@
 
   // Construct the new instruction.
   MachineInstrBuilder MIB;
-  if (isNarrowStore(Opc)) {
+  if (isPromotableZeroStore(I)) {
     // Change the scaled offset from small to large type.
     if (!IsUnscaled) {
       assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
@@ -782,7 +797,7 @@
     }
     MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                   TII->get(getMatchingWideOpcode(Opc)))
-              .addOperand(getLdStRegOp(I))
+              .addReg(isNarrowStore(I) ? AArch64::WZR : AArch64::XZR)
               .addOperand(BaseRegOp)
               .addImm(OffsetImm)
               .setMemRefs(I->mergeMemRefsWith(*Paired));
@@ -1091,10 +1106,10 @@
   unsigned Reg = getLdStRegOp(FirstMI).getReg();
   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
   int Offset = getLdStOffsetOp(FirstMI).getImm();
-  bool IsNarrowStore = isNarrowStore(Opc);
+  bool IsMergeableZeroStore = isPromotableZeroStore(FirstMI);
 
-  // For narrow stores, find only the case where the stored value is WZR.
-  if (IsNarrowStore && Reg != AArch64::WZR)
+  // For narrow stores, merge only when storing zero.
+  if (isNarrowStore(Opc) && !IsMergeableZeroStore)
     return E;
 
   // Early exit if the first instruction modifies the base register.
@@ -1106,7 +1121,7 @@
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
-  if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
+  if (!(isNarrowLoad(Opc) || IsMergeableZeroStore) &&
       !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
     return E;
 
@@ -1173,7 +1188,7 @@
           continue;
         }
 
-        if (IsNarrowLoad || IsNarrowStore) {
+        if (IsNarrowLoad || IsMergeableZeroStore) {
           // If the alignment requirements of the scaled wide load/store
           // instruction can't express the offset of the scaled narrow
           // input, bail and keep looking.
@@ -1198,7 +1213,7 @@
         // For narrow stores, allow only when the stored value is the same
         // (i.e., WZR).
         if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
-            (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
+            (IsMergeableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
           MemInsns.push_back(MI);
           continue;
@@ -1520,7 +1535,7 @@
   if (Paired != E) {
     if (isNarrowLoad(MI)) {
       ++NumNarrowLoadsPromoted;
-    } else if (isNarrowStore(MI)) {
+    } else if (isPromotableZeroStore(MI)) {
       ++NumZeroStoresPromoted;
     } else {
       ++NumPairCreated;
@@ -1608,13 +1623,15 @@
     case AArch64::LDRSHWui:
     case AArch64::STRBBui:
     case AArch64::STRHHui:
+    case AArch64::STRWui:
     // Unscaled instructions.
     case AArch64::LDURBBi:
     case AArch64::LDURHHi:
     case AArch64::LDURSBWi:
     case AArch64::LDURSHWi:
     case AArch64::STURBBi:
-    case AArch64::STURHHi: {
+    case AArch64::STURHHi:
+    case AArch64::STURWi: {
       if (tryToMergeLdStInst(MBBI)) {
         Modified = true;
         break;
Index: test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
===================================================================
--- test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
+++ test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
@@ -331,7 +331,7 @@
 }
 
 ; CHECK-LABEL: Strh_zero_4
-; CHECK: stp wzr, wzr
+; CHECK: str xzr
 define void @Strh_zero_4(i16* nocapture %P, i32 %n) {
 entry:
   %idxprom = sext i32 %n to i64
@@ -352,6 +352,42 @@
   ret void
 }
 
+; CHECK-LABEL: Strw_zero
+; CHECK: str xzr
+define void @Strw_zero(i32* nocapture %P, i32 %n) {
+entry:
+  %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  store i32 0, i32* %arrayidx
+  %add = add nsw i32 %n, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1
+  store i32 0, i32* %arrayidx2
+  ret void
+}
+
+; CHECK-LABEL: Strw_zero_4
+; CHECK: stp xzr
+define void @Strw_zero_4(i32* nocapture %P, i32 %n) {
+entry:
+  %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  store i32 0, i32* %arrayidx
+  %add = add nsw i32 %n, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1
+  store i32 0, i32* %arrayidx2
+  %add3 = add nsw i32 %n, 2
+  %idxprom4 = sext i32 %add3 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %P, i64 %idxprom4
+  store i32 0, i32* %arrayidx5
+  %add6 = add nsw i32 %n, 3
+  %idxprom7 = sext i32 %add6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32* %P, i64 %idxprom7
+  store i32 0, i32* %arrayidx8
+  ret void
+}
+
 ; CHECK-LABEL: Sturb_zero
 ; CHECK: sturh wzr
 define void @Sturb_zero(i8* nocapture %P, i32 %n) #0 {
@@ -383,7 +419,7 @@
 }
 
 ; CHECK-LABEL: Sturh_zero_4
-; CHECK: stp wzr, wzr
+; CHECK: stur xzr
 define void @Sturh_zero_4(i16* nocapture %P, i32 %n) {
 entry:
   %sub = add nsw i32 %n, -3
@@ -404,3 +440,42 @@
   store i16 0, i16* %arrayidx9
   ret void
 }
+
+; CHECK-LABEL: Sturw_zero
+; CHECK: stur xzr
+define void @Sturw_zero(i32* nocapture %P, i32 %n) {
+entry:
+  %sub = add nsw i32 %n, -3
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  store i32 0, i32* %arrayidx
+  %sub1 = add nsw i32 %n, -4
+  %idxprom2 = sext i32 %sub1 to i64
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i64 %idxprom2
+  store i32 0, i32* %arrayidx3
+  ret void
+}
+
+; CHECK-LABEL: Sturw_zero_4
+; CHECK: stp xzr
+define void @Sturw_zero_4(i32* nocapture %P, i32 %n) {
+entry:
+  %sub = add nsw i32 %n, -3
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  store i32 0, i32* %arrayidx
+  %sub1 = add nsw i32 %n, -4
+  %idxprom2 = sext i32 %sub1 to i64
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i64 %idxprom2
+  store i32 0, i32* %arrayidx3
+  %sub4 = add nsw i32 %n, -2
+  %idxprom5 = sext i32 %sub4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %P, i64 %idxprom5
+  store i32 0, i32* %arrayidx6
+  %sub7 = add nsw i32 %n, -1
+  %idxprom8 = sext i32 %sub7 to i64
+  %arrayidx9 = getelementptr inbounds i32, i32* %P, i64 %idxprom8
+  store i32 0, i32* %arrayidx9
+  ret void
+}
+