diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -904,6 +904,13 @@
   switch (MI.getOpcode()) {
   default:
     return MI.isAsCheapAsAMove();
+
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBXrs:
+    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
+
   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
   // ORRXri, it is as cheap as MOV.
   // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
diff --git a/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll b/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -o - | FileCheck %s
+; RUN: llc -mattr=+alu-lsl-fast < %s -o - | FileCheck %s -check-prefix=LSLFAST
+target triple = "aarch64-linux"
+
+declare void @g(...)
+
+; Check that ADDWrs/ADDXrs with shift > 4 is considered relatively
+; slow, thus CSE-d.
+define void @f0(i1 %c0, i1 %c1, ptr %a, i64 %i) {
+; CHECK-LABEL: f0:
+; CHECK: // %bb.0: // %E
+; CHECK-NEXT: tbz w0, #0, .LBB0_5
+; CHECK-NEXT: // %bb.1: // %A
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x0, x2, x3, lsl #5
+; CHECK-NEXT: tbz w1, #0, .LBB0_3
+; CHECK-NEXT: // %bb.2: // %B
+; CHECK-NEXT: bl g
+; CHECK-NEXT: b .LBB0_4
+; CHECK-NEXT: .LBB0_3: // %C
+; CHECK-NEXT: mov x1, x0
+; CHECK-NEXT: bl g
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .LBB0_5: // %X
+; CHECK-NEXT: ret
+;
+; LSLFAST-LABEL: f0:
+; LSLFAST: // %bb.0: // %E
+; LSLFAST-NEXT: tbz w0, #0, .LBB0_5
+; LSLFAST-NEXT: // %bb.1: // %A
+; LSLFAST-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; LSLFAST-NEXT: .cfi_def_cfa_offset 16
+; LSLFAST-NEXT: .cfi_offset w30, -16
+; LSLFAST-NEXT: add x0, x2, x3, lsl #5
+; LSLFAST-NEXT: tbz w1, #0, .LBB0_3
+; LSLFAST-NEXT: // %bb.2: // %B
+; LSLFAST-NEXT: bl g
+; LSLFAST-NEXT: b .LBB0_4
+; LSLFAST-NEXT: .LBB0_3: // %C
+; LSLFAST-NEXT: mov x1, x0
+; LSLFAST-NEXT: bl g
+; LSLFAST-NEXT: .LBB0_4:
+; LSLFAST-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; LSLFAST-NEXT: .LBB0_5: // %X
+; LSLFAST-NEXT: ret
+E:
+  %p0 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
+  br i1 %c0, label %A, label %X
+
+A:
+  br i1 %c1, label %B, label %C
+
+B:
+  call void @g(ptr %p0)
+  br label %X
+
+C:
+  %p1 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
+  call void @g(ptr %p1, ptr %p0)
+  br label %X
+
+X:
+  ret void
+}
+
+; Check that ADDWrs/ADDXrs with shift <= 4 is considered relatively fast on sub-targets
+; with feature +alu-lsl-fast, thus *not* CSE-d.
+define void @f1(i1 %c0, i1 %c1, ptr %a, i64 %i) {
+; CHECK-LABEL: f1:
+; CHECK: // %bb.0: // %E
+; CHECK-NEXT: tbz w0, #0, .LBB1_5
+; CHECK-NEXT: // %bb.1: // %A
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x0, x2, x3, lsl #4
+; CHECK-NEXT: tbz w1, #0, .LBB1_3
+; CHECK-NEXT: // %bb.2: // %B
+; CHECK-NEXT: bl g
+; CHECK-NEXT: b .LBB1_4
+; CHECK-NEXT: .LBB1_3: // %C
+; CHECK-NEXT: mov x1, x0
+; CHECK-NEXT: bl g
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .LBB1_5: // %X
+; CHECK-NEXT: ret
+;
+; LSLFAST-LABEL: f1:
+; LSLFAST: // %bb.0: // %E
+; LSLFAST-NEXT: tbz w0, #0, .LBB1_5
+; LSLFAST-NEXT: // %bb.1: // %A
+; LSLFAST-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; LSLFAST-NEXT: .cfi_def_cfa_offset 16
+; LSLFAST-NEXT: .cfi_offset w30, -16
+; LSLFAST-NEXT: add x8, x2, x3, lsl #4
+; LSLFAST-NEXT: tbz w1, #0, .LBB1_3
+; LSLFAST-NEXT: // %bb.2: // %B
+; LSLFAST-NEXT: mov x0, x8
+; LSLFAST-NEXT: bl g
+; LSLFAST-NEXT: b .LBB1_4
+; LSLFAST-NEXT: .LBB1_3: // %C
+; LSLFAST-NEXT: add x0, x2, x3, lsl #4
+; LSLFAST-NEXT: mov x1, x8
+; LSLFAST-NEXT: bl g
+; LSLFAST-NEXT: .LBB1_4:
+; LSLFAST-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; LSLFAST-NEXT: .LBB1_5: // %X
+; LSLFAST-NEXT: ret
+E:
+  %p0 = getelementptr {i64, i64}, ptr %a, i64 %i
+  br i1 %c0, label %A, label %X
+
+A:
+  br i1 %c1, label %B, label %C
+
+B:
+  call void @g(ptr %p0)
+  br label %X
+
+C:
+  %p1 = getelementptr {i64, i64}, ptr %a, i64 %i
+  call void @g(ptr %p1, ptr %p0)
+  br label %X
+
+X:
+  ret void
+}