Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -43,6 +43,7 @@
           "Number of load/store from unscaled generated");
 STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
+STATISTIC(NumLoadsFromStoredPromoted, "Number of loads from stores promoted");
 
 static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                    cl::init(20), cl::Hidden);
@@ -93,6 +94,13 @@
   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                                LdStPairFlags &Flags,
                                                unsigned Limit);
+
+  // Scan the instructions looking for a store that writes to the address
+  // from which the current load instruction reads.
+  // Return true if one is found.
+  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
+                         MachineBasicBlock::iterator &Stored);
+
   // Merge the two instructions indicated into a single pair-wise instruction.
   // If MergeForward is true, erase the first instruction and fold its
   // operation into the second. If false, the reverse. Return the instruction
@@ -102,6 +110,11 @@
                               MachineBasicBlock::iterator Paired,
                               const LdStPairFlags &Flags);
+
+  // Promote the load that reads directly from the preceding store.
+  MachineBasicBlock::iterator
+  promoteLoadFromStored(MachineBasicBlock::iterator LoadInst,
+                        MachineBasicBlock::iterator StoreInst);
+
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
   // pre or post indexed addressing with writeback. Scan forwards.
@@ -128,6 +141,9 @@
   // Find and merge foldable ldr/str instructions.
   bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+
+  // Find and promote load instructions which read directly from a store.
+  bool tryToPromoteLoadFromStored(MachineBasicBlock::iterator &MBBI);
+
   // Check if converting two narrow loads into a single wider load with
   // bitfield extracts could be enabled.
   bool enableNarrowLdMerge(MachineFunction &Fn);
@@ -399,6 +415,36 @@
   }
 }
 
+static bool isMatchingStore(MachineInstr *LoadInst, MachineInstr *StoreInst) {
+  unsigned LdOpc = LoadInst->getOpcode();
+  unsigned StOpc = StoreInst->getOpcode();
+  switch (LdOpc) {
+  default:
+    llvm_unreachable("Unsupported load instruction!");
+  case AArch64::LDRBBui:
+    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
+           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+  case AArch64::LDURBBi:
+    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
+           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+  case AArch64::LDRHHui:
+    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
+           StOpc == AArch64::STRXui;
+  case AArch64::LDURHHi:
+    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
+           StOpc == AArch64::STURXi;
+  case AArch64::LDRWui:
+    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+  case AArch64::LDURWi:
+    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+  case AArch64::LDRXui:
+    return StOpc == AArch64::STRXui;
+  case AArch64::LDURXi:
+    return StOpc == AArch64::STURXi;
+  }
+}
+
 static unsigned getPreIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
@@ -553,6 +599,21 @@
   return MI->getOperand(Idx);
 }
 
+static bool isLdOffsetInRangeOfStored(MachineInstr *LoadInst,
+                                      MachineInstr *StoreInst) {
+  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
+  int LoadSize = getMemScale(LoadInst);
+  int StoreSize = getMemScale(StoreInst);
+  int UnscaledStOffset = isUnscaledLdSt(StoreInst)
+                             ? getLdStOffsetOp(StoreInst).getImm()
+                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+  int UnscaledLdOffset = isUnscaledLdSt(LoadInst)
+                             ? getLdStOffsetOp(LoadInst).getImm()
+                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
+  return (UnscaledStOffset <= UnscaledLdOffset) &&
+         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
+}
+
 // Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
 static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                    MachineInstr *Op1) {
@@ -800,6 +861,107 @@
   return NextI;
 }
 
+MachineBasicBlock::iterator AArch64LoadStoreOpt::promoteLoadFromStored(
+    MachineBasicBlock::iterator LoadInst,
+    MachineBasicBlock::iterator StoreInst) {
+  MachineBasicBlock::iterator NextI = LoadInst;
+  ++NextI;
+
+  int LoadSize = getMemScale(LoadInst);
+  int StoreSize = getMemScale(StoreInst);
+  unsigned LdRt = getLdStRegOp(LoadInst).getReg();
+  unsigned StRt = getLdStRegOp(StoreInst).getReg();
+  bool IsStoreXReg =
+      TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
+
+  assert((IsStoreXReg ||
+          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
+         "Unexpected RegClass");
+
+  MachineInstr *BitExtMI;
+  if (LoadSize == StoreSize) {
+    // Remove the load if its destination register is the same register that
+    // holds the stored value.
+    if (StRt == LdRt) {
+      DEBUG(dbgs() << "Remove load instruction:\n    ");
+      DEBUG(LoadInst->print(dbgs()));
+      DEBUG(dbgs() << "\n");
+      LoadInst->eraseFromParent();
+      return NextI;
+    }
+    // Replace the load with a mov if the load and store are the same size.
+    BitExtMI =
+        BuildMI(*LoadInst->getParent(), LoadInst, LoadInst->getDebugLoc(),
+                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
+            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
+            .addReg(StRt)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  } else {
+    // FIXME: Currently this transformation is disabled on big-endian targets
+    // because performance and correctness have only been verified on
+    // little-endian.
+    if (!Subtarget->isLittleEndian())
+      return NextI;
+    bool IsUnscaled = isUnscaledLdSt(LoadInst);
+    assert(IsUnscaled == isUnscaledLdSt(StoreInst) &&
+           "Unsupported ld/st match");
+    assert(LoadSize < StoreSize && "Invalid load size");
+    int UnscaledLdOffset = IsUnscaled
+                               ? getLdStOffsetOp(LoadInst).getImm()
+                               : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
+    int UnscaledStOffset =
+        IsUnscaled ? getLdStOffsetOp(StoreInst).getImm()
+                   : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+    int Width = LoadSize * 8;
+    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+    int Imms = Immr + Width - 1;
+    unsigned DestReg = IsStoreXReg
+                           ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
+                                                      &AArch64::GPR64RegClass)
+                           : LdRt;
+
+    assert((UnscaledLdOffset >= UnscaledStOffset &&
+            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
+           "Invalid offset");
+
+    if (UnscaledLdOffset == UnscaledStOffset) {
+      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
+                                | (Immr << 6)                 // immr
+                                | (Imms << 0);                // imms
+
+      BitExtMI =
+          BuildMI(*LoadInst->getParent(), LoadInst, LoadInst->getDebugLoc(),
+                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
+                  DestReg)
+              .addReg(StRt)
+              .addImm(AndMaskEncoded);
+    } else {
+      BitExtMI =
+          BuildMI(*LoadInst->getParent(), LoadInst, LoadInst->getDebugLoc(),
+                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
+                  DestReg)
+              .addReg(StRt)
+              .addImm(Immr)
+              .addImm(Imms);
+    }
+  }
+
+  DEBUG(dbgs() << "Promoting load by replacing:\n    ");
+  DEBUG(StoreInst->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(LoadInst->print(dbgs()));
+  DEBUG(dbgs() << "  with instructions:\n    ");
+  DEBUG(StoreInst->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(BitExtMI->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old load instruction; the store is left in place.
+  LoadInst->eraseFromParent();
+  return NextI;
+}
+
 /// trackRegDefsUses - Remember what registers the specified instruction uses
 /// and modifies.
 static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
@@ -863,6 +1025,60 @@
   return false;
 }
 
+bool AArch64LoadStoreOpt::findMatchingStore(
+    MachineBasicBlock::iterator I, unsigned Limit,
+    MachineBasicBlock::iterator &Stored) {
+  MachineBasicBlock::iterator E = I->getParent()->begin();
+  MachineBasicBlock::iterator MBBI = I;
+  MachineInstr *FirstMI = I;
+  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+
+  // Track which registers have been modified and used between the first insn
+  // and the second insn.
+  BitVector ModifiedRegs, UsedRegs;
+  ModifiedRegs.resize(TRI->getNumRegs());
+  UsedRegs.resize(TRI->getNumRegs());
+
+  for (unsigned Count = 0; MBBI != E && Count < Limit;) {
+    --MBBI;
+    MachineInstr *MI = MBBI;
+    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+    // optimization by changing how far we scan.
+    if (MI->isDebugValue())
+      continue;
+    // Now that we know this is a real instruction, count it.
+    ++Count;
+
+    // If the load instruction reads directly from the address to which the
+    // store instruction writes and the stored value is not modified, we can
+    // promote the load. Since we do not handle stores with pre-/post-index,
+    // it's unnecessary to check if BaseReg is modified by the store itself.
+    if (MI->mayStore() && isMatchingStore(FirstMI, MI) &&
+        BaseReg == getLdStBaseOp(MI).getReg() &&
+        isLdOffsetInRangeOfStored(FirstMI, MI) &&
+        !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
+      Stored = MBBI;
+      return true;
+    }
+
+    if (MI->isCall())
+      return false;
+
+    // Update modified / uses register lists.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg])
+      return false;
+
+    // If we encounter a store aliased with the load, return early.
+    if (MI->mayStore() && mayAlias(FirstMI, MI, TII))
+      return false;
+  }
+  return false;
+}
+
 /// findMatchingInsn - Scan the instructions looking for a load/store that can
 /// be combined with the current instruction into a load/store pair.
 MachineBasicBlock::iterator
@@ -1263,6 +1479,31 @@
   return E;
 }
 
+bool AArch64LoadStoreOpt::tryToPromoteLoadFromStored(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = MBBI;
+  // If this is a volatile load, don't mess with it.
+  if (MI->hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm.
+  // FIXME: It is possible to extend it to handle reg+reg cases.
+  if (!getLdStOffsetOp(MI).isImm())
+    return false;
+
+  // Look backward up to ScanLimit instructions.
+  MachineBasicBlock::iterator Stored;
+  if (findMatchingStore(MBBI, ScanLimit, Stored)) {
+    ++NumLoadsFromStoredPromoted;
+    // Promote the load. Keeping the iterator straight is a
+    // pain, so we let the merge routine tell us what the next instruction
+    // is after it's done mucking about.
+    MBBI = promoteLoadFromStored(MBBI, Stored);
+    return true;
+  }
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToMergeLdStInst(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr *MI = MBBI;
@@ -1307,7 +1548,16 @@
                                         bool enableNarrowLdOpt) {
   bool Modified = false;
-  // Three tranformations to do here:
-  // 1) Find narrow loads that can be converted into a single wider load
+  // Four transformations to do here:
+  // 1) Find loads that directly read from stores and promote them by
+  //    replacing them with mov instructions. If the store is wider than the
+  //    load, the load will be replaced with a bitfield extract.
+  //    e.g.,
+  //      str w1, [x0, #4]
+  //      ldrh w2, [x0, #6]
+  //    ; becomes
+  //      str w1, [x0, #4]
+  //      lsr w2, w1, #16
+  // 2) Find narrow loads that can be converted into a single wider load
   //    with bitfield extract instructions.
   //    e.g.,
   //      ldrh w0, [x2]
@@ -1316,14 +1566,14 @@
   //      ldrh w1, [x2, #2]
   //    ; becomes
   //      ldr w0, [x2]
   //      ubfx w1, w0, #16, #16
   //      and w0, w0, #ffff
-  // 2) Find loads and stores that can be merged into a single load or store
+  // 3) Find loads and stores that can be merged into a single load or store
   //    pair instruction.
   //    e.g.,
   //      ldr x0, [x2]
   //      ldr x1, [x2, #8]
   //    ; becomes
   //      ldp x0, x1, [x2]
-  // 3) Find base register updates that can be merged into the load or store
+  // 4) Find base register updates that can be merged into the load or store
   //    as a base-reg writeback.
   //    e.g.,
   //      ldr x0, [x2]
   //      add x2, x2, #4
   //    ; becomes
@@ -1332,6 +1582,35 @@
   //      ldr x0, [x2], #4
 
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    // Scaled instructions.
+    case AArch64::LDRBBui:
+    case AArch64::LDRHHui:
+    case AArch64::LDRWui:
+    case AArch64::LDRXui:
+    // Unscaled instructions.
+ case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + if (tryToPromoteLoadFromStored(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); enableNarrowLdOpt && MBBI != E;) { MachineInstr *MI = MBBI; switch (MI->getOpcode()) { Index: test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll =================================================================== --- test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll +++ test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s ; CHECK: foo -; CHECK: ldr w[[REG:[0-9]+]], [x19, #264] -; CHECK: str w[[REG]], [x19, #132] -; CHECK: ldr w{{[0-9]+}}, [x19, #264] +; CHECK: str w[[REG0:[0-9]+]], [x19, #264] +; CHECK: mov w[[REG1:[0-9]+]], w[[REG0]] +; CHECK: str w[[REG1]], [x19, #132] define i32 @foo(i32 %a) nounwind { %retval = alloca i32, align 4 Index: test/CodeGen/AArch64/arm64-ld-from-st.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/arm64-ld-from-st.ll @@ -0,0 +1,680 @@ +; RUN: llc < %s -mtriple aarch64--none-eabi -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: Str64Ldr64 +; CHECK: mov x0, x1 +define i64 @Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i64* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 1 + %1 = load i64, i64* %arrayidx1 + ret i64 %1 +} + +; CHECK-LABEL: Str64Ldr32_0 +; CHECK: and x0, x1, #0xffffffff +define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 2 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str64Ldr32_1 +; CHECK: lsr x0, x1, #32 +define i32 @Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 3 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str64Ldr16_0 +; CHECK: and x0, x1, #0xffff +define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 4 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_1 +; CHECK: ubfx x0, x1, #16, #16 +define i16 @Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 5 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_2 +; CHECK: ubfx x0, x1, #32, #16 +define i16 @Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 6 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_3 +; CHECK: lsr x0, x1, #48 +define i16 
@Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 7 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr8_0 +; CHECK: and x0, x1, #0xff +define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 8 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_1 +; CHECK: ubfx x0, x1, #8, #8 +define i8 @Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 9 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_2 +; CHECK: ubfx x0, x1, #16, #8 +define i8 @Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 10 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_3 +; CHECK: ubfx x0, x1, #24, #8 +define i8 @Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 11 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_4 +; CHECK: ubfx x0, x1, #32, #8 +define i8 @Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 12 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_5 +; CHECK: ubfx x0, x1, #40, #8 +define i8 @Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 13 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_6 +; CHECK: ubfx x0, x1, #48, #8 +define i8 @Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 14 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_7 +; CHECK: lsr x0, x1, #56 +define i8 @Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 15 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr32 +; CHECK: mov w0, w1 +define i32 @Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i32* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str32Ldr16_0 +; CHECK: and w0, w1, #0xffff +define i16 @Str32Ldr16_0(i32* nocapture %P, 
i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str32Ldr16_1 +; CHECK: lsr w0, w1, #16 +define i16 @Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str32Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 5 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_2 +; CHECK: ubfx w0, w1, #16, #8 +define i8 @Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 6 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_3 +; CHECK: lsr w0, w1, #24 +define i8 @Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 7 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str16Ldr16 +; CHECK: mov w0, w1 +define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i16* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str16Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str16Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 3 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + + +; CHECK-LABEL: Unscaled_Str64Ldr64 +; CHECK: mov x0, x1 +define i64 @Unscaled_Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i64* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 -1 + %1 = load i64, i64* %arrayidx1 + ret i64 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr32_0 +; CHECK: and x0, x1, #0xffffffff +define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, 
i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -2 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr32_1 +; CHECK: lsr x0, x1, #32 +define i32 @Unscaled_Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_0 +; CHECK: and x0, x1, #0xffff +define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -4 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_1 +; CHECK: ubfx x0, x1, #16, #16 +define i16 @Unscaled_Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_2 +; CHECK: ubfx x0, x1, #32, #16 +define i16 @Unscaled_Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_3 +; CHECK: lsr x0, x1, #48 +define i16 @Unscaled_Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_0 +; CHECK: and x0, x1, #0xff +define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -8 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_1 +; CHECK: ubfx x0, x1, #8, #8 +define i8 @Unscaled_Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -7 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_2 +; CHECK: ubfx x0, x1, #16, #8 +define i8 @Unscaled_Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -6 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_3 +; CHECK: ubfx x0, x1, #24, #8 +define i8 @Unscaled_Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = 
getelementptr inbounds i8, i8* %0, i64 -5 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_4 +; CHECK: ubfx x0, x1, #32, #8 +define i8 @Unscaled_Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_5 +; CHECK: ubfx x0, x1, #40, #8 +define i8 @Unscaled_Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_6 +; CHECK: ubfx x0, x1, #48, #8 +define i8 @Unscaled_Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_7 +; CHECK: lsr x0, x1, #56 +define i8 @Unscaled_Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr32 +; CHECK: mov w0, w1 +define i32 @Unscaled_Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i32* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr16_0 +; CHECK: and w0, w1, #0xffff +define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr16_1 +; CHECK: lsr w0, w1, #16 +define i16 @Unscaled_Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Unscaled_Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_2 +; CHECK: ubfx w0, w1, #16, #8 +define i8 @Unscaled_Str32Ldr8_2(i32* nocapture 
%P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_3 +; CHECK: lsr w0, w1, #24 +define i8 @Unscaled_Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr16 +; CHECK: mov w0, w1 +define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i16* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Unscaled_Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: StrVolatileLdr +; CHECK: ldrh +define i16 @StrVolatileLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load volatile i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: StrNotInRangeLdr +; CHECK: ldrh +define i16 @StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_StrNotInRangeLdr +; CHECK: ldurh +define i16 @Unscaled_StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: StrCallLdr +; CHECK: ldrh +define i16 @StrCallLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %c = call i1 @test_dummy() + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +declare i1 @test_dummy() + +; CHECK-LABEL: StrDefValLdr +; CHECK: ldrh +define i32 @StrDefValLdr(i32 %v, i32* %P, i16* %P2, i32 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %r = sub i32 %v, %n + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load i16, i16* 
%arrayidx1 + store i16 %1, i16* %P2 + ret i32 %r +} + +; CHECK-LABEL: StrStrLdr +; CHECK: ldrh +define i16 @StrStrLdr(i32 %v, i32* %P, i32* %P2, i32 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + store i32 %n, i32* %P2 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} Index: test/CodeGen/AArch64/regress-tblgen-chains.ll =================================================================== --- test/CodeGen/AArch64/regress-tblgen-chains.ll +++ test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -27,8 +27,8 @@ ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]] ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1 -; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]] -; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]] +; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]] +; CHECK: mov {{w[0-9]+}}, w[[STRVAL]] %ret.1 = load i8, i8* %locvar %ret.2 = zext i8 %ret.1 to i64