Index: include/llvm/CodeGen/MachineOperand.h
===================================================================
--- include/llvm/CodeGen/MachineOperand.h
+++ include/llvm/CodeGen/MachineOperand.h
@@ -919,6 +919,16 @@
     assert(isReg() && "Can only add reg operand to use lists");
     return Contents.Reg.Prev != nullptr;
   }
+
+public:
+  /// Returns true if this register operand's use-list successor itself has a
+  /// successor, i.e. at least two further operands follow this one on the
+  /// register's use list. Used to detect whether a register has additional
+  /// uses beyond the instruction currently being inspected.
+  bool isOnRegUseListNext() const {
+    assert(isReg() && "Can only query the use list of a reg operand");
+    return Contents.Reg.Next && Contents.Reg.Next->Contents.Reg.Next;
+  }
 };
 
 template <> struct DenseMapInfo {
Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -50,6 +50,7 @@
     "Number of load/store from unscaled generated");
 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumAddressComputation, "Number of address computations folded");
 
 // The LdStLimit limits how far we search for load/store pairs.
 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
@@ -158,6 +159,12 @@
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  // Fold a shifted-register ADD (ADD[WX]rs) into the addressing mode of a
+  // load/store, producing a register-offset (ro[WX]) load/store.
+  MachineBasicBlock::iterator
+  mergeAddWithLDSTInstruction(MachineBasicBlock::iterator I,
+                              MachineBasicBlock::iterator Update,
+                              bool IsPreIdx);
+
   // Find and merge zero store instructions.
   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
 
@@ -1373,12 +1380,155 @@
   return NextI;
 }
 
+// Returns true if the ADD and the load/store can be folded together.
+// E.g.:  add x8, x9, x8, lsl #3
+//        str xzr, [x8]
+// can be merged into
+//        str xzr, [x9, x8, lsl #3]     (STRXroX)
+// Both instructions must operate on the same register width ('X' or 'W').
+static bool isAddrFoldableInst(MachineBasicBlock::iterator Update,
+                               MachineBasicBlock::iterator I) {
+  unsigned IOpc = I->getOpcode();
+  switch (Update->getOpcode()) {
+  default:
+    return false;
+  case AArch64::ADDXrs:
+    // A 64-bit ADD folds only into a 64-bit load/store.
+    return IOpc == AArch64::STRXui || IOpc == AArch64::LDRXui;
+  case AArch64::ADDWrs:
+    // A 32-bit ADD folds only into a 32-bit load/store.
+    return IOpc == AArch64::STRWui || IOpc == AArch64::LDRWui;
+  }
+}
+
+// Returns the AArch64 register-offset opcode for the merged (new)
+// instruction, or 0 if this pair of instructions has no register-offset form.
+static unsigned getTargetOpcodeForFoldInst(MachineBasicBlock::iterator Update,
+                                           MachineBasicBlock::iterator I) {
+  unsigned UpdateOpc = Update->getOpcode();
+  if (UpdateOpc == AArch64::ADDXrs || UpdateOpc == AArch64::ADDWrs) {
+    switch (I->getOpcode()) {
+    default:
+      break;
+    case AArch64::STRXui:
+      return AArch64::STRXroX;
+    case AArch64::STRWui:
+      return AArch64::STRWroX;
+    case AArch64::LDRXui:
+      return AArch64::LDRXroX;
+    case AArch64::LDRWui:
+      return AArch64::LDRWroX;
+    }
+  }
+  return 0;
+}
+
+// Merge an ADD and a STR/LDR into a single register-offset load/store and
+// build the new instruction.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeAddWithLDSTInstruction(MachineBasicBlock::iterator I,
+                                                 MachineBasicBlock::iterator Update,
+                                                 bool IsPreIdx) {
+  MachineBasicBlock::iterator NextI = I;
+  // Return the instruction following the merged instruction, which is
+  // the instruction following our unmerged store. Unless that's the add/sub
+  // instruction we're merging, in which case it's the one after that.
+  if (++NextI == Update)
+    ++NextI;
+  // Shift amount taken from the ADD's operand 3.
+  // NOTE(review): for ADD[WX]rs this immediate encodes shift type *and*
+  // amount; only a plain LSL makes the raw value equal the amount — confirm
+  // the matcher only accepts LSL.
+  int Value = Update->getOperand(3).getImm();
+  // Scale of the memory access relative to the shift amount.
+  // NOTE(review): divides by Value — a zero shift would divide by zero;
+  // confirm callers never accept a zero shift amount.
+  unsigned int ScaleVal = getMemScale(*I) / Value;
+  // Opcode of the merged register-offset instruction (0 if no mapping).
+  unsigned Opc = getTargetOpcodeForFoldInst(Update, I);
+  // Clear kill flags on the ADD's source registers: they remain live as the
+  // base/offset of the merged instruction (and in the ADD if it is kept
+  // below instead of being erased).
+  Update->getOperand(1).setIsKill(false);
+  Update->getOperand(2).setIsKill(false);
+
+  MachineInstrBuilder MIB;
+  if (!isPairedLdSt(*I)) {
+    // Non-paired instruction.
+    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(Opc))
+              .add(I->getOperand(0))
+              .add(Update->getOperand(1))
+              .add(Update->getOperand(2))
+              .addImm(0)
+              .addImm(Value)
+              .setMemRefs(I->memoperands())
+              .setMIFlags(I->mergeFlagsWith(*Update));
+  } else {
+    // Paired instruction.
+    // NOTE(review): getTargetOpcodeForFoldInst maps no paired opcodes, so
+    // Opc is 0 on this path — confirm this branch is actually reachable.
+    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(Opc))
+              .add(I->getOperand(0))
+              .add(getLdStRegOp(*Update, 0))
+              .add(getLdStRegOp(*Update, 1))
+              .add(Update->getOperand(2))
+              .addImm(0)
+              .addImm(ScaleVal)
+              .setMemRefs(I->memoperands())
+              .setMIFlags(I->mergeFlagsWith(*I));
+  }
+
+  ++NumAddressComputation;
+  LLVM_DEBUG(dbgs() << "Creating Address Computation.");
+  LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+  LLVM_DEBUG(I->print(dbgs()));
+  LLVM_DEBUG(dbgs() << " ");
+  LLVM_DEBUG(Update->print(dbgs()));
+  LLVM_DEBUG(dbgs() << " with instruction:\n ");
+  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  // If the base register (the load/store's operand 1) still has further uses,
+  // the ADD must be kept: move it below the merged instruction. Otherwise it
+  // is dead and can be erased.
+
+  if(I->getOperand(1).isOnRegUseListNext()) {
+    MachineBasicBlock *MBB = I->getParent();
+    MBB->splice(I, MBB, &*Update);
+  }
+  else {
+    Update->eraseFromParent();
+  }
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  return NextI;
+}
+
 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                                MachineInstr &MI,
                                                unsigned BaseReg, int Offset) {
   switch (MI.getOpcode()) {
   default:
     break;
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs: {
+    if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isReg() ||
+        !MI.getOperand(3).isImm())
+      break;
+    // Operand 3 is a shifter immediate encoding (type << 6) | amount.  Only
+    // a plain LSL (type 0, so the raw value equals the amount) whose amount
+    // matches the access size can become a register-offset load/store.  A
+    // zero amount is rejected because the merge code scales by it.
+    int64_t Imm = MI.getOperand(3).getImm();
+    if (Imm <= 0 || Imm > 4 || (1LL << Imm) != getMemScale(MemMI))
+      break;
+    // The ADD must define the load/store's base register, and the load/store
+    // must not carry an immediate offset.
+    if (MI.getOperand(0).getReg() != BaseReg || Offset != 0)
+      break;
+    return true;
+  }
   case AArch64::SUBXri:
   case AArch64::ADDXri:
     // Make sure it's a vanilla immediate operand, not a relocation or
@@ -1633,9 +1783,13 @@
     // ldr x0, [x20], #32
     Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
     if (Update != E) {
-      // Merge the update into the ld/st.
-      MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
-      return true;
+      // Only immediate add/sub updates can be merged as a post-index form;
+      // the shifted-register ADDs matched above are handled pre-index only.
+      if (Update->getOpcode() == AArch64::ADDXri ||
+          Update->getOpcode() == AArch64::SUBXri) {
+        // Merge the update into the ld/st.
+        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
+        return true;
+      }
     }
 
     // Don't know how to handle unscaled pre/post-index versions below, so bail.
@@ -1643,15 +1797,28 @@
     return false;
 
   // Look back to try to find a pre-index instruction. For example,
-  // add x0, x0, #8
-  // ldr x1, [x0]
-  // merged into:
-  // ldr x1, [x0, #8]!
   Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
   if (Update != E) {
-    // Merge the update into the ld/st.
-    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
-    return true;
+    if (isAddrFoldableInst(Update, MBBI)) {
+      // A shifted-register ADD can fold into a register-offset form:
+      //   add x8, x9, x8, lsl #3
+      //   str xzr, [x8]
+      // merged into:
+      //   str xzr, [x9, x8, lsl #3]
+      MBBI = mergeAddWithLDSTInstruction(MBBI, Update, /*IsPreIdx=*/true);
+      return true;
+    }
+    if (Update->getOpcode() == AArch64::ADDXri ||
+        Update->getOpcode() == AArch64::SUBXri) {
+      // An immediate add/sub becomes a pre-index update:
+      //   add x0, x0, #8
+      //   ldr x1, [x0]
+      // merged into:
+      //   ldr x1, [x0, #8]!
+      MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+      return true;
+    }
   }
 
   // The immediate in the load/store is scaled by the size of the memory
@@ -1666,9 +1833,12 @@
     // ldr x1, [x0, #64]!
     Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
     if (Update != E) {
-      // Merge the update into the ld/st.
-      MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
-      return true;
+      // Only immediate add/sub updates can be merged here.
+      if (Update->getOpcode() == AArch64::ADDXri ||
+          Update->getOpcode() == AArch64::SUBXri) {
+        // Merge the update into the ld/st.
+        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+        return true;
+      }
     }
 
     return false;
Index: test/CodeGen/AArch64/fold_addressing_modes_aarch64.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/fold_addressing_modes_aarch64.ll
@@ -0,0 +1,47 @@
+; RUN: llc -o - %s -mtriple=aarch64-arm-none-eabi -verify-machineinstrs | FileCheck %s
+; ModuleID = './test_51309.c'
source_filename = "./test_51309.c"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-arm-none-eabi"
+
+%struct.As = type { i32, i32 }
+%struct.Bs = type { i16 }
+
+@A = external dso_local local_unnamed_addr global [4 x %struct.As], align 4
+@B = external dso_local local_unnamed_addr global %struct.Bs*, align 8
+
+; Function Attrs: minsize norecurse nounwind optsize
+
+; The add computing &A[idx] must be folded into the store's addressing mode
+; (register offset with lsl #3), leaving no standalone add.
+; CHECK-LABEL: test:
+; CHECK: adrp
+; CHECK: ldr
+; CHECK-NEXT: ldrsh
+; CHECK-NOT: add
+; CHECK: str xzr, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #3]
+define dso_local void @test() local_unnamed_addr #0 {
+  %1 = load %struct.Bs*, %struct.Bs** @B, align 8, !tbaa !2
+  %2 = getelementptr inbounds %struct.Bs, %struct.Bs* %1, i64 0, i32 0
+  %3 = load i16, i16* %2, align 2, !tbaa !6
+  %4 = sext i16 %3 to i64
+  %5 = getelementptr inbounds [4 x %struct.As], [4 x %struct.As]* @A, i64 0, i64 %4, i32 0
+  %6 = bitcast i32* %5 to <2 x i32>*
+  store <2 x i32> zeroinitializer, <2 x i32>* %6, align 4, !tbaa !9
+  ret void
+}
+
+attributes #0 = { minsize norecurse nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a53" "target-features"="+aes,+crc,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://git.llvm.org/git/clang.git/ 268b249f1d4cbc212d1853ac9821194f868eef36) (https://git.llvm.org/git/llvm.git/ 772398facdeaf5e5f4f8ca641e06f354441ad9ac)"}
+; TBAA type descriptors referenced by the !tbaa tags on the loads/stores in
+; @test above (any-pointer, short member of %struct.Bs, and int).
+!2 = !{!3, !3, i64 0}
+!3 = !{!"any pointer", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = !{!7, !8, i64 0}
+!7 = !{!"Bs", !8, i64 0}
+!8 = !{!"short", !4, i64 0}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !4, i64 0}