Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -78,13 +78,12 @@
 struct AArch64LoadStoreOpt : public MachineFunctionPass {
   static char ID;
-  AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
   }
 
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
-  bool IsStrictAlign;
 
   // Scan the instructions looking for a load/store that can be combined
   // with the current instruction into a load/store pair.
@@ -127,7 +126,11 @@
   // Find and merge foldable ldr/str instructions.
   bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
 
-  bool optimizeBlock(MachineBasicBlock &MBB);
+  // Check if converting two narrow loads into a single wider load with
+  // bitfield extracts could be enabled.
+  bool enableNarrowLdMerge(MachineFunction &Fn);
+
+  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
 
@@ -1161,7 +1164,8 @@
   return false;
 }
 
-bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+                                        bool enableNarrowLdOpt) {
   bool Modified = false;
   // Three tranformations to do here:
   // 1) Find halfword loads that can be merged into a single 32-bit word load
@@ -1189,7 +1193,7 @@
   //        ldr x0, [x2], #4
 
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
-       !IsStrictAlign && MBBI != E;) {
+       enableNarrowLdOpt && MBBI != E;) {
     MachineInstr *MI = MBBI;
     switch (MI->getOpcode()) {
     default:
@@ -1372,15 +1376,25 @@
   return Modified;
 }
 
+bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
+  const AArch64Subtarget *SubTarget =
+      &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+  bool ProfitableArch = SubTarget->isCortexA57();
+  // FIXME: The benefit from converting narrow loads into a wider load could be
+  // microarchitectural as it assumes that a single load with two bitfield
+  // extracts is cheaper than two narrow loads. Currently, this conversion is
+  // enabled only on Cortex-A57, where the performance benefit was verified.
+  return ProfitableArch && !SubTarget->requiresStrictAlign();
+}
+
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
   TRI = Fn.getSubtarget().getRegisterInfo();
-  IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
-                      .requiresStrictAlign();
 
   bool Modified = false;
+  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
   for (auto &MBB : Fn)
-    Modified |= optimizeBlock(MBB);
+    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
 
   return Modified;
 }
Index: test/CodeGen/AArch64/arm64-ldp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-ldp.ll
+++ test/CodeGen/AArch64/arm64-ldp.ll
@@ -356,51 +356,3 @@
   ret i64 %add
 }
 
-; CHECK-LABEL: Ldrh_merge
-; CHECK-NOT: ldrh
-; CHECK: ldr [[NEW_DEST:w[0-9]+]]
-; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
-; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
-
-define i16 @Ldrh_merge(i16* nocapture readonly %p) {
-  %1 = load i16, i16* %p, align 2
-  ;%conv = zext i16 %0 to i32
-  %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
-  %2 = load i16, i16* %arrayidx2, align 2
-  %add = add nuw nsw i16 %1, %2
-  ret i16 %add
-}
-
-; CHECK-LABEL: Ldurh_merge
-; CHECK-NOT: ldurh
-; CHECK: ldur [[NEW_DEST:w[0-9]+]]
-; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
-; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
-define i16 @Ldurh_merge(i16* nocapture readonly %p) {
-entry:
-  %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
-  %0 = load i16, i16* %arrayidx
-  %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
-  %1 = load i16, i16* %arrayidx3
-  %add = add nuw nsw i16 %0, %1
-  ret i16 %add
-}
-
-; CHECK-LABEL: Ldrh_4_merge
-; CHECK-NOT: ldrh
-; CHECK: ldp [[NEW_DEST:w[0-9]+]]
-define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
-  %arrayidx = getelementptr inbounds i16, i16* %P, i64 0
-  %l0 = load i16, i16* %arrayidx
-  %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
-  %l1 = load i16, i16* %arrayidx2
-  %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
-  %l2 = load i16, i16* %arrayidx7
-  %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
-  %l3 = load i16, i16* %arrayidx12
-  %add4 = add nuw nsw i16 %l1, %l0
-  %add9 = add nuw nsw i16 %add4, %l2
-  %add14 = add nuw nsw i16 %add9, %l3
-
-  ret i16 %add14
-}
Index: test/CodeGen/AArch64/arm64-ldr-merge.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/arm64-ldr-merge.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: Ldrh_merge
+; CHECK-NOT: ldrh
+; CHECK: ldr [[NEW_DEST:w[0-9]+]]
+; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
+; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
+define i16 @Ldrh_merge(i16* nocapture readonly %p) {
+  %1 = load i16, i16* %p, align 2
+  %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
+  %2 = load i16, i16* %arrayidx2, align 2
+  %add = add nuw nsw i16 %1, %2
+  ret i16 %add
+}
+
+; CHECK-LABEL: Ldurh_merge
+; CHECK-NOT: ldurh
+; CHECK: ldur [[NEW_DEST:w[0-9]+]]
+; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
+; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
+define i16 @Ldurh_merge(i16* nocapture readonly %p) {
+entry:
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
+  %0 = load i16, i16* %arrayidx
+  %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
+  %1 = load i16, i16* %arrayidx3
+  %add = add nuw nsw i16 %0, %1
+  ret i16 %add
+}
+
+; CHECK-LABEL: Ldrh_4_merge
+; CHECK-NOT: ldrh
+; CHECK: ldp [[NEW_DEST:w[0-9]+]]
+define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
+  %arrayidx = getelementptr inbounds i16, i16* %P, i64 0
+  %l0 = load i16, i16* %arrayidx
+  %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
+  %l1 = load i16, i16* %arrayidx2
+  %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
+  %l2 = load i16, i16* %arrayidx7
+  %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
+  %l3 = load i16, i16* %arrayidx12
+  %add4 = add nuw nsw i16 %l1, %l0
+  %add9 = add nuw nsw i16 %add4, %l2
+  %add14 = add nuw nsw i16 %add9, %l3
+  ret i16 %add14
+}
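
Note on what the relocated CHECK lines pin down: the narrow-load merge rewrites two adjacent halfword loads as one 32-bit load whose halves are recovered with bitfield operations, which is only done when enableNarrowLdMerge() returns true (Cortex-A57, non-strict-align). A rough sketch of the intended codegen shape for Ldrh_merge follows; register numbers and exact operand forms are illustrative, not copied from llc output:

;   Before the pass (two narrow loads):
;     ldrh  w8, [x0]
;     ldrh  w9, [x0, #2]
;     add   w0, w9, w8
;   After the pass (one wider load plus bitfield extracts, the pattern the
;   CHECK: ldr / and #0xffff / lsr lines look for):
;     ldr   w8, [x0]
;     and   w9, w8, #0xffff      ; low halfword
;     lsr   w10, w8, #16         ; high halfword
;     add   w0, w9, w10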