Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -27,8 +27,11 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -96,11 +99,19 @@
   AliasAnalysis *AA;
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  const TargetSubtargetInfo *STI;
   const AArch64Subtarget *Subtarget;
 
   // Track which registers have been modified and used.
   BitVector ModifiedRegs, UsedRegs;
 
+  // Whether the target has an instruction scheduling (cost) model.
+  bool HasInstrSchedModel;
+  TargetSchedModel TSM;
+
+  // Whether the function is being optimized for code size.
+  bool OptForSize;
+
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AAResultsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -154,6 +165,10 @@
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
+  // Evaluate whether the new instruction is a better choice than the old ones.
+  bool isProfitableMergeUpdate(unsigned New,
+                               MachineInstr &OldA, MachineInstr &OldB);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
@@ -651,6 +666,40 @@
   }
 }
 
+bool AArch64LoadStoreOpt::isProfitableMergeUpdate(unsigned New,
+                                                  MachineInstr &OldA,
+                                                  MachineInstr &OldB) {
+  // Default to profitable if optimizing for size or
+  // in the absence of a cost model.
+  if (OptForSize || !HasInstrSchedModel)
+    return true;
+
+  const MCInstrDesc &NewID = TII->get(New),
+                    &OldAID = TII->get(OldA.getOpcode()),
+                    &OldBID = TII->get(OldB.getOpcode());
+  const MCSchedClassDesc
+    *NewSD = TSM.getMCSchedModel()->getSchedClassDesc(NewID.getSchedClass()),
+    *OldASD = TSM.getMCSchedModel()->getSchedClassDesc(OldAID.getSchedClass()),
+    *OldBSD = TSM.getMCSchedModel()->getSchedClassDesc(OldBID.getSchedClass());
+  // Default to profitable if any class is invalid or the new one is variant.
+  if (NewSD->isVariant() || !NewSD->isValid() ||
+      !OldASD->isValid() || !OldBSD->isValid())
+    return true;
+
+  unsigned NewLat = TSM.computeInstrLatency(New),
+           OldALat = TSM.computeInstrLatency(&OldA),
+           OldBLat = TSM.computeInstrLatency(&OldB);
+  // Profitable if the new instruction is faster than both old ones combined.
+  if (NewLat < (OldALat + OldBLat))
+    return true;
+  // At equal latency, require fewer micro-ops than both old ones combined.
+  if (NewLat == (OldALat + OldBLat))
+    return (NewSD->NumMicroOps < (OldASD->NumMicroOps + OldBSD->NumMicroOps));
+
+  // It is not profitable.
+  return false;
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                            MachineBasicBlock::iterator MergeMI,
@@ -1345,6 +1394,11 @@
   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                              : getPostIndexedOpcode(I->getOpcode());
+
+  // Evaluate whether the new instruction is a better choice than both old ones.
+  if (!isProfitableMergeUpdate(NewOpc, *I, *Update))
+    return NextI;
+
   MachineInstrBuilder MIB;
   if (!isPairedLdSt(*I)) {
     // Non-paired instruction.
@@ -1768,6 +1822,13 @@
   TRI = Subtarget->getRegisterInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  OptForSize = Fn.getFunction()->optForSize();
+
+  const TargetSubtargetInfo &STI = Fn.getSubtarget();
+  TSM.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
+  HasInstrSchedModel = TSM.hasInstrSchedModel();
+
   // Resize the modified and used register bitfield trackers. We do this once
   // per function and then clear the bitfield each time we optimize a load or
   // store.
Index: llvm/test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- llvm/test/CodeGen/AArch64/ldst-opt.ll
+++ llvm/test/CodeGen/AArch64/ldst-opt.ll
@@ -1,5 +1,6 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTRICTALIGN %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+strict-align -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=STRICTALIGN %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,NOSTRICTALIGN
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mattr=+strict-align -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,STRICTALIGN
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1 -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS
 
 ; This file contains tests for the AArch64 load/store optimizer.
 
@@ -7,8 +8,8 @@
 %s.byte = type { i8, i8 }
 %s.halfword = type { i16, i16 }
 %s.word = type { i32, i32 }
-%s.doubleword = type { i64, i32 }
-%s.quadword = type { fp128, i32 }
+%s.doubleword = type { i64, i64 }
+%s.quadword = type { fp128, fp128 }
 %s.float = type { float, i32 }
 %s.double = type { double, i32 }
 %struct.byte = type { %padding, %s.byte }
@@ -267,6 +268,39 @@
   ret void
 }
 
+define void @load-pair-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind {
+; CHECK-LABEL: load-pair-pre-indexed-doubleword
+; CHECK: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]!
+; CHECK-NOT: add x0, x0, #32
+entry:
+  %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+  %a1 = load i64, i64* %a, align 8
+  %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1
+  %b1 = load i64, i64* %b, align 8
+  %add = add i64 %a1, %b1
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1
+  tail call void @bar_doubleword(%s.doubleword* %c, i64 %add)
+  ret void
+}
+
+define void @store-pair-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) nounwind {
+; CHECK-LABEL: store-pair-pre-indexed-doubleword
+; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]!
+; CHECK-NOT: add x0, x0, #32
+entry:
+  %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+  store i64 %val, i64* %a, align 8
+  %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1
+  store i64 %val, i64* %b, align 8
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1
+  tail call void @bar_doubleword(%s.doubleword* %c, i64 %val)
+  ret void
+}
+
 ; Check the following transform:
 ;
 ; add x8, x8, #16
@@ -1031,7 +1065,6 @@
 define void @store-pair-post-indexed-word() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-word
 ; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16
-; CHECK: ret
   %src = alloca { i32, i32 }, align 8
   %dst = alloca { i32, i32 }, align 8
 
@@ -1050,7 +1083,6 @@
 define void @store-pair-post-indexed-doubleword() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-doubleword
 ; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32
-; CHECK: ret
   %src = alloca { i64, i64 }, align 8
   %dst = alloca { i64, i64 }, align 8
 
@@ -1069,7 +1101,6 @@
 define void @store-pair-post-indexed-float() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-float
 ; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16
-; CHECK: ret
   %src = alloca { float, float }, align 8
   %dst = alloca { float, float }, align 8
 
@@ -1088,7 +1119,6 @@
 define void @store-pair-post-indexed-double() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-double
 ; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32
-; CHECK: ret
   %src = alloca { double, double }, align 8
   %dst = alloca { double, double }, align 8
 
@@ -1104,6 +1134,27 @@
   ret void
 }
 
+define void @store-pair-post-indexed-quadword() nounwind {
+; CHECK-LABEL: store-pair-post-indexed-quadword
+; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp], #64
+; EXYNOS: str q{{[0-9]+}}, [sp]
+; EXYNOS-NEXT: str q{{[0-9]+}}, [sp, #16]
+; EXYNOS-NEXT: add sp, sp, #64
+  %src = alloca { fp128, fp128 }, align 8
+  %dst = alloca { fp128, fp128 }, align 8
+
+  %src.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 0
+  %src.real = load fp128, fp128* %src.realp
+  %src.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 1
+  %src.imag = load fp128, fp128* %src.imagp
+
+  %dst.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 1
+  store fp128 %src.real, fp128* %dst.realp
+  store fp128 %src.imag, fp128* %dst.imagp
+  ret void
+}
+
 ; Check the following transform:
 ;
 ; (ldr|str) X, [x20]
@@ -1343,6 +1394,7 @@
 ; CHECK: // %entry
 ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
@@ -1358,6 +1410,7 @@
 ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
+; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
@@ -1379,6 +1432,7 @@
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #508]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #512]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #516]
+; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
 ; CHECK-NEXT: ret
 entry:
   %p0 = getelementptr i32, i32* %p, i32 126
@@ -1404,6 +1458,8 @@
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4100]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4104]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4108]
+; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
 ; CHECK-NEXT: ret
 entry:
   %p0 = getelementptr i32, i32* %p, i32 1024
@@ -1429,6 +1485,9 @@
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #16]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #24]
+; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16]
 ; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
@@ -1455,6 +1514,7 @@
 ; CHECK: // %entry
 ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <2 x i32> zeroinitializer, <2 x i32>* %p
@@ -1469,6 +1529,8 @@
 ; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
+; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}]
+; EXYNOS-NEXT: str wzr, [x{{[0-9]+}}, #8]
 ; CHECK-NEXT: ret
 entry:
   store <3 x i32> zeroinitializer, <3 x i32>* %p
@@ -1482,6 +1544,7 @@
 ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
+; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <4 x i32> zeroinitializer, <4 x i32>* %p
@@ -1494,6 +1557,7 @@
 ; CHECK: // %entry
 ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <2 x float> zeroinitializer, <2 x float>* %p
@@ -1507,6 +1571,7 @@
 ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
+; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <4 x float> zeroinitializer, <4 x float>* %p
@@ -1547,6 +1612,7 @@
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
+; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <2 x i64> zeroinitializer, <2 x i64>* %p, align 1
@@ -1562,6 +1628,9 @@
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #16]
+; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16]
 ; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
@@ -1601,8 +1670,11 @@
 define void @merge_zr64_4vecd(<4 x double>* %p) {
 ; CHECK-LABEL: merge_zr64_4vecd:
 ; CHECK: // %entry
-; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
-; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; GENERIC-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; GENERIC-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <4 x double> zeroinitializer, <4 x double>* %p
@@ -1620,6 +1692,10 @@
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #24]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48]
+; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; EXYNOS-NEXT: str q0, [x0]
+; EXYNOS-NEXT: stur q0, [x0, #24]
+; EXYNOS-NEXT: str q0, [x0, #48]
 ; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
@@ -1647,6 +1723,11 @@
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #16]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #32]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48]
+; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #32]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #48]
 ; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
@@ -1670,7 +1751,7 @@
 ; Check for bug 34674 where invalid add of xzr was being generated.
 ; CHECK-LABEL: bug34674:
 ; CHECK: // %entry
-; CHECK-NEXT: mov [[ZREG:x[0-9]+]], xzr
+; CHECK-NEXT: mov [[ZREG:x[0-9]+]], {{#0|xzr}}
 ; CHECK-DAG: stp [[ZREG]], [[ZREG]], [x0]
 ; CHECK-DAG: add x{{[0-9]+}}, [[ZREG]], #1
 define i64 @bug34674(<2 x i64>* %p) {
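
For readers unfamiliar with the cost-model query: isProfitableMergeUpdate keeps the merged pre/post-indexed form only when the scheduling model says it is no worse than the two instructions it replaces. The standalone sketch below restates that rule with plain integers; the InstrCost struct, the isProfitableMerge helper, and the latency/micro-op numbers are invented for illustration only and are not part of the patch or of any real scheduling model.

#include <cstdio>

// Hedged sketch of the profitability rule added in this patch: the merged
// instruction is kept when it is strictly faster than the two originals
// combined, or equally fast while using fewer micro-ops. All numbers here
// are made up for illustration.
struct InstrCost {
  unsigned Latency;
  unsigned MicroOps;
};

static bool isProfitableMerge(const InstrCost &New, const InstrCost &OldA,
                              const InstrCost &OldB) {
  unsigned OldLat = OldA.Latency + OldB.Latency;
  // Strictly faster than both old instructions combined: always merge.
  if (New.Latency < OldLat)
    return true;
  // Same latency: merge only if it saves micro-ops.
  if (New.Latency == OldLat)
    return New.MicroOps < OldA.MicroOps + OldB.MicroOps;
  // Slower: keep the separate store and base-register update.
  return false;
}

int main() {
  // Hypothetical core where the paired pre/post-indexed Q-form store is
  // microcoded: same latency as STR Q + ADD, but more micro-ops, so the
  // merge is rejected and this prints "no".
  InstrCost PairedUpdate = {/*Latency=*/2, /*MicroOps=*/4};
  InstrCost Store = {/*Latency=*/1, /*MicroOps=*/1};
  InstrCost Add = {/*Latency=*/1, /*MicroOps=*/1};
  std::printf("merge profitable: %s\n",
              isProfitableMerge(PairedUpdate, Store, Add) ? "yes" : "no");
  return 0;
}

In the pass itself the inputs come from TargetSchedModel::computeInstrLatency and MCSchedClassDesc::NumMicroOps, and the whole check is skipped when the function is optimized for size or the subtarget has no instruction scheduling model, as shown in the C++ hunks above.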