Index: llvm/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64.td
+++ llvm/lib/Target/AArch64/AArch64.td
@@ -105,6 +105,10 @@
     "fuse-aes", "HasFuseAES", "true",
     "CPU fuses AES crypto operations">;
 
+def FeatureFuseLiterals : SubtargetFeature<
+    "fuse-literals", "HasFuseLiterals", "true",
+    "CPU fuses literal generation operations">;
+
 def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
     "Disable latency scheduling heuristic">;
@@ -181,6 +185,7 @@
 def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                                    "Cortex-A57 ARM processors", [
                                    FeatureFuseAES,
+                                   FeatureFuseLiterals,
                                    FeatureBalanceFPOps,
                                    FeatureCRC,
                                    FeatureCrypto,
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1989,6 +1989,20 @@
         (FirstOpcode == AArch64::AESDrr && SecondOpcode == AArch64::AESIMCrr))
       return true;
 
+  if (Subtarget.hasFuseLiterals())
+    // Fuse literal generation operations:
+    // - PC relative addresses
+    // - large immediates
+    if ((FirstOpcode == AArch64::ADRP && SecondOpcode == AArch64::ADDXri) ||
+        ((FirstOpcode == AArch64::MOVZWi && SecondOpcode == AArch64::MOVKWi &&
+          Second.getOperand(3).getImm() == 16) ||
+         (FirstOpcode == AArch64::MOVZXi && SecondOpcode == AArch64::MOVKXi &&
+          Second.getOperand(3).getImm() == 16) ||
+         (FirstOpcode == AArch64::MOVKXi && SecondOpcode == AArch64::MOVKXi &&
+          First.getOperand(3).getImm() == 32 &&
+          Second.getOperand(3).getImm() == 48)))
+      return true;
+
   return false;
 }
 
Index: llvm/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -84,6 +84,7 @@
   bool HasArithmeticBccFusion = false;
   bool HasArithmeticCbzFusion = false;
   bool HasFuseAES = false;
+  bool HasFuseLiterals = false;
   bool DisableLatencySchedHeuristic = false;
   bool UseRSqrt = false;
   uint8_t MaxInterleaveFactor = 2;
@@ -197,6 +198,7 @@
   bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
   bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
   bool hasFuseAES() const { return HasFuseAES; }
+  bool hasFuseLiterals() const { return HasFuseLiterals; }
   bool useRSqrt() const { return UseRSqrt; }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -327,6 +327,23 @@
     return DAG;
   }
 
+  /// Build the post-RA machine scheduler for this function. When the
+  /// subtarget has the fuse-literals feature, the generic post-RA scheduler
+  /// is returned with the macro fusion mutation attached; otherwise nullptr.
+  ScheduleDAGInstrs *
+  createPostMachineScheduler(MachineSchedContext *C) const override {
+    const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
+    if (ST.hasFuseLiterals()) {
+      // Run the Macro Fusion after RA again since literals are expanded from
+      // pseudos then (v. addPreSched2()).
+      ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+      DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+      return DAG;
+    }
+
+    return nullptr;
+  }
+
   void addIRPasses() override;
   bool addPreISel() override;
   bool addInstSelector() override;
Index: llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
@@ -0,0 +1,50 @@
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=-fuse-literals | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKDONT
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-literals | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSE
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57      | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSE
+
+; Check that literal generation pairs (adrp+add, mov+movk, movk+movk) are
+; emitted back to back when fuse-literals is enabled, and that an unrelated
+; add is scheduled between them when it is disabled.
+
+@g = common local_unnamed_addr global i8* null, align 8
+
+define i8* @litp(i32 %a, i32 %b) {
+entry:
+  %add = add nsw i32 %b, %a
+  %idx.ext = sext i32 %add to i64
+  %add.ptr = getelementptr i8, i8* bitcast (i8* (i32, i32)* @litp to i8*), i64 %idx.ext
+  store i8* %add.ptr, i8** @g, align 8
+  ret i8* %add.ptr
+
+; CHECK-LABEL: litp:
+; CHECK: adrp [[R:x[0-9]+]], litp
+; CHECKDONT-NEXT: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECKFUSE-NEXT: add {{x[0-9]+}}, [[R]], :lo12:litp
+}
+
+define i32 @liti(i32 %a, i32 %b) {
+entry:
+  %add = add i32 %a, -262095121
+  %add1 = add i32 %add, %b
+  ret i32 %add1
+
+; CHECK-LABEL: liti:
+; CHECK: mov [[R:w[0-9]+]], {{#[0-9]+}}
+; CHECKDONT-NEXT: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECKFUSE-NEXT: movk [[R]], {{#[0-9]+}}, lsl #16
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @litl(i64 %a, i64 %b) {
+entry:
+  %add = add i64 %a, 2208998440489107183
+  %add1 = add i64 %add, %b
+  ret i64 %add1
+
+; CHECK-LABEL: litl:
+; CHECK: mov [[R:x[0-9]+]], {{#[0-9]+}}
+; CHECK-NEXT: movk [[R]], {{#[0-9]+}}, lsl #16
+; CHECK: movk [[R]], {{#[0-9]+}}, lsl #32
+; CHECKDONT-NEXT: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECKFUSE-NEXT: movk [[R]], {{#[0-9]+}}, lsl #48
+}