Index: llvm/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64.td
+++ llvm/lib/Target/AArch64/AArch64.td
@@ -220,14 +220,9 @@
     "enable-select-opt", "EnableSelectOptimize", "true",
     "Enable the select optimize pass for select loop heuristics">;
 
-def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
-    "HasCustomCheapAsMoveHandling", "true",
-    "Use custom handling of cheap instructions">;
-
 def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
     "HasExynosCheapAsMoveHandling", "true",
-    "Use Exynos specific handling of cheap instructions",
-    [FeatureCustomCheapAsMoveHandling]>;
+    "Use Exynos specific handling of cheap instructions">;
 
 def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
     "UsePostRAScheduler", "true", "Schedule again after register allocation">;
@@ -775,7 +770,6 @@
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
                                    FeatureBalanceFPOps,
-                                   FeatureCustomCheapAsMoveHandling,
                                    FeaturePostRAScheduler]>;
 
 def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
@@ -796,7 +790,6 @@
                                    "Cortex-A57 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureBalanceFPOps,
-                                   FeatureCustomCheapAsMoveHandling,
                                    FeatureFuseAdrpAdd,
                                    FeatureFuseLiterals,
                                    FeaturePostRAScheduler,
@@ -1080,7 +1073,6 @@
 
 def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                 "Qualcomm Kryo processors", [
-                                FeatureCustomCheapAsMoveHandling,
                                 FeaturePostRAScheduler,
                                 FeaturePredictableSelectIsExpensive,
                                 FeatureZCZeroing,
@@ -1089,7 +1081,6 @@
 
 def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                   "Qualcomm Falkor processors", [
-                                  FeatureCustomCheapAsMoveHandling,
                                   FeaturePostRAScheduler,
                                   FeaturePredictableSelectIsExpensive,
                                   FeatureZCZeroing,
@@ -1149,7 +1140,6 @@
 
 def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    "Qualcomm Saphira processors", [
-                                   FeatureCustomCheapAsMoveHandling,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
@@ -1197,7 +1187,6 @@
 
 def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
                                   "HiSilicon TS-V110 processors", [
-                                  FeatureCustomCheapAsMoveHandling,
                                   FeatureFuseAES,
                                   FeaturePostRAScheduler]>;
 
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -798,85 +798,48 @@
   return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
 }
 
-// FIXME: this implementation should be micro-architecture dependent, so a
-// micro-architecture target hook should be introduced here in future.
-bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
-  if (!Subtarget.hasCustomCheapAsMoveHandling())
-    return MI.isAsCheapAsAMove();
-
-  const unsigned Opcode = MI.getOpcode();
-
-  // Firstly, check cases gated by features.
+// Return true if Imm can be loaded into a register by a "cheap" sequence of
+// MOVZ and MOVK. For now, "cheap" means at most two instructions.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool isMOVZKImm(uint64_t Imm,
+                                                    unsigned BitSize) {
+  if (BitSize == 32)
+    return true;
 
-  if (Subtarget.hasZeroCycleZeroingFP()) {
-    if (Opcode == AArch64::FMOVH0 ||
-        Opcode == AArch64::FMOVS0 ||
-        Opcode == AArch64::FMOVD0)
-      return true;
-  }
+  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
+  unsigned n = !!(Imm & 0xffff) + !!(Imm & 0xffff0000) +
+               !!(Imm & 0xffff00000000) + !!(Imm & 0xffff000000000000);
 
-  if (Subtarget.hasZeroCycleZeroingGP()) {
-    if (Opcode == TargetOpcode::COPY &&
-        (MI.getOperand(1).getReg() == AArch64::WZR ||
-         MI.getOperand(1).getReg() == AArch64::XZR))
-      return true;
-  }
+  return n <= 2;
+}
 
-  // Secondly, check cases specific to sub-targets.
+/// Returns true if a MOVi32imm or MOVi64imm can be expanded to a "cheap"
+/// sequence of MOVZ/MOVN/MOVK.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+canBeExpandedToMOVZNK(const MachineInstr &MI, unsigned BitSize) {
+  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
+  return isMOVZKImm(Imm, BitSize) || isMOVZKImm(~Imm, BitSize);
+}
 
+// FIXME: this implementation should be micro-architecture dependent, so a
+// micro-architecture target hook should be introduced here in future.
+bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   if (Subtarget.hasExynosCheapAsMoveHandling()) {
     if (isExynosCheapAsMove(MI))
       return true;
-
     return MI.isAsCheapAsAMove();
   }
 
-  // Finally, check generic cases.
-
-  switch (Opcode) {
+  switch (MI.getOpcode()) {
   default:
-    return false;
-
-  // add/sub on register without shift
-  case AArch64::ADDWri:
-  case AArch64::ADDXri:
-  case AArch64::SUBWri:
-  case AArch64::SUBXri:
-    return (MI.getOperand(3).getImm() == 0);
-
-  // logical ops on immediate
-  case AArch64::ANDWri:
-  case AArch64::ANDXri:
-  case AArch64::EORWri:
-  case AArch64::EORXri:
-  case AArch64::ORRWri:
-  case AArch64::ORRXri:
-    return true;
-
-  // logical ops on register without shift
-  case AArch64::ANDWrr:
-  case AArch64::ANDXrr:
-  case AArch64::BICWrr:
-  case AArch64::BICXrr:
-  case AArch64::EONWrr:
-  case AArch64::EONXrr:
-  case AArch64::EORWrr:
-  case AArch64::EORXrr:
-  case AArch64::ORNWrr:
-  case AArch64::ORNXrr:
-  case AArch64::ORRWrr:
-  case AArch64::ORRXrr:
-    return true;
-
+    return MI.isAsCheapAsAMove();
   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
-  // ORRXri, it is as cheap as MOV
+  // ORRXri, it is as cheap as MOV.
+  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
   case AArch64::MOVi32imm:
-    return canBeExpandedToORR(MI, 32);
+    return canBeExpandedToMOVZNK(MI, 32) || canBeExpandedToORR(MI, 32);
   case AArch64::MOVi64imm:
-    return canBeExpandedToORR(MI, 64);
+    return canBeExpandedToMOVZNK(MI, 64) || canBeExpandedToORR(MI, 64);
   }
-
-  llvm_unreachable("Unknown opcode to check as cheap as a move!");
 }
 
 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
Index: llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux"
+
+; Check an "expensive" construction of a constant is hoisted out of a loop
+define void @f0(ptr %a, i64 %n) {
+; CHECK-LABEL: f0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w23, -40
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    mov x21, #1 // =0x1
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    movk x21, #22136, lsl #16
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    mov x22, xzr
+; CHECK-NEXT:    movk x21, #4660, lsl #48
+; CHECK-NEXT:    cmp x22, x19
+; CHECK-NEXT:    b.ge .LBB0_2
+; CHECK-NEXT:  .LBB0_1: // %loop.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lsl x23, x22, #2
+; CHECK-NEXT:    mov x1, x21
+; CHECK-NEXT:    ldr w0, [x20, x23]
+; CHECK-NEXT:    bl g
+; CHECK-NEXT:    add x22, x22, #1
+; CHECK-NEXT:    str w0, [x20, x23]
+; CHECK-NEXT:    cmp x22, x19
+; CHECK-NEXT:    b.lt .LBB0_1
+; CHECK-NEXT:  .LBB0_2: // %exit
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [0, %entry], [%i.next, %loop.body]
+  %c = icmp slt i64 %i, %n
+  br i1 %c, label %loop.body, label %exit
+
+loop.body:
+  %p = getelementptr i32, ptr %a, i64 %i
+  %v = load i32, ptr %p
+  %w = call i32 @g(i32 %v, i64 1311673392922361857) ; 0x1234000056780001
+  store i32 %w, ptr %p
+  %i.next = add i64 %i, 1
+  br label %loop
+
+exit:
+  ret void
+}
+
+; Check a "cheap" to construct constant is materialised inside a loop.
+define void @f1(ptr %a, i64 %n) {
+; CHECK-LABEL: f1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    mov x21, xzr
+; CHECK-NEXT:    cmp x21, x19
+; CHECK-NEXT:    b.ge .LBB1_2
+; CHECK-NEXT:  .LBB1_1: // %loop.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lsl x22, x21, #2
+; CHECK-NEXT:    mov x1, #1450704896 // =0x56780000
+; CHECK-NEXT:    movk x1, #4660, lsl #48
+; CHECK-NEXT:    ldr w0, [x20, x22]
+; CHECK-NEXT:    bl g
+; CHECK-NEXT:    add x21, x21, #1
+; CHECK-NEXT:    str w0, [x20, x22]
+; CHECK-NEXT:    cmp x21, x19
+; CHECK-NEXT:    b.lt .LBB1_1
+; CHECK-NEXT:  .LBB1_2: // %exit
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  br label %loop

loop:
+  %i = phi i64 [0, %entry], [%i.next, %loop.body]
+  %c = icmp slt i64 %i, %n
+  br i1 %c, label %loop.body, label %exit
+
+loop.body:
+  %p = getelementptr i32, ptr %a, i64 %i
+  %v = load i32, ptr %p
+  %w = call i32 @g(i32 %v, i64 1311673392922361856) ; 0x1234000056780000
+  store i32 %w, ptr %p
+  %i.next = add i64 %i, 1
+  br label %loop
+
+exit:
+  ret void
+}
+
+declare i32 @g(i32, i64)
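
For reference, a minimal standalone C++ sketch (not part of the patch) of the halfword-counting heuristic that isMOVZKImm/canBeExpandedToMOVZNK implement, applied to the two constants used in the test: 0x1234000056780001 has three non-zero 16-bit halfwords and is therefore "expensive" (hoisted out of the loop in @f0), while 0x1234000056780000 has two and is "cheap" (rematerialised inside the loop in @f1). The function names, the 64-bit-only signature, and the driver below are illustrative assumptions, not code from the patch.

#include <cstdint>
#include <cstdio>

// Cheap if at most two 16-bit halfwords of the immediate are non-zero, i.e.
// it can be built with one MOVZ plus at most one MOVK. This mirrors the
// 64-bit path of isMOVZKImm above; the 32-bit case is always cheap there.
static bool isMOVZKImm64(uint64_t Imm) {
  unsigned n = !!(Imm & 0xffff) + !!(Imm & 0xffff0000) +
               !!(Imm & 0xffff00000000) + !!(Imm & 0xffff000000000000);
  return n <= 2;
}

// Also try the bitwise complement, covering a MOVN-based expansion, as
// canBeExpandedToMOVZNK does.
static bool isCheapImm64(uint64_t Imm) {
  return isMOVZKImm64(Imm) || isMOVZKImm64(~Imm);
}

int main() {
  // Constant from @f0: halfwords 0x1234, 0x0000, 0x5678, 0x0001 -> three
  // non-zero, so not cheap; the test expects mov+movk+movk hoisted out of
  // the loop.
  std::printf("0x1234000056780001 cheap: %d\n",
              isCheapImm64(0x1234000056780001ULL)); // prints 0
  // Constant from @f1: halfwords 0x1234, 0x0000, 0x5678, 0x0000 -> two
  // non-zero, so cheap; the test expects mov+movk inside the loop body.
  std::printf("0x1234000056780000 cheap: %d\n",
              isCheapImm64(0x1234000056780000ULL)); // prints 1
  return 0;
}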