Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -15,6 +15,16 @@ // later. In this case, we could try to split the constant operand of mov // instruction into two bitmask immediates. It makes two AND instructions // intead of multiple `mov` + `and` instructions. +// +// 2. Remove redundant ORRWrs which is generated by zero-extend. +// +// %3:gpr32 = ORRWrs $wzr, %2, 0 +// %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32 +// +// If AArch64's 32-bit form of instruction defines the source operand of +// ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source +// operand are set to zero. +// //===----------------------------------------------------------------------===// #include "AArch64ExpandImm.h" @@ -44,6 +54,8 @@ template bool visitAND(MachineInstr &MI, SmallSetVector &ToBeRemoved); + bool visitORR(MachineInstr &MI, + SmallSetVector &ToBeRemoved); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -196,6 +208,49 @@ return true; } +bool AArch64MIPeepholeOpt::visitORR( + MachineInstr &MI, SmallSetVector &ToBeRemoved) { + // Check this ORR comes from below zero-extend pattern. + // + // def : Pat<(i64 (zext GPR32:$src)), + // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; + if (MI.getOperand(3).getImm() != 0) + return false; + + if (MI.getOperand(1).getReg() != AArch64::WZR) + return false; + + MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); + if (!SrcMI) + return false; + + // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC + // + // When you use the 32-bit form of an instruction, the upper 32 bits of the + // source registers are ignored and the upper 32 bits of the destination + // register are set to zero. + // + // If AArch64's 32-bit form of instruction defines the source operand of + // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is + // real AArch64 instruction and if it is not, do not process the opcode + // conservatively. + if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) + return false; + + Register DefReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + MRI->replaceRegWith(DefReg, SrcReg); + MRI->clearKillFlags(SrcReg); + // replaceRegWith changes MI's definition register. Keep it for SSA form until + // deleting MI. + MI.getOperand(0).setReg(DefReg); + ToBeRemoved.insert(&MI); + + LLVM_DEBUG({ dbgs() << "Removed: " << MI << "\n"; }); + + return true; +} + bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -221,6 +276,8 @@ case AArch64::ANDXrr: Changed = visitAND(MI, ToBeRemoved); break; + case AArch64::ORRWrs: + Changed = visitORR(MI, ToBeRemoved); } } } Index: llvm/test/CodeGen/AArch64/arm64-assert-zext-sext.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-assert-zext-sext.ll +++ llvm/test/CodeGen/AArch64/arm64-assert-zext-sext.ll @@ -1,9 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O2 -mtriple=aarch64-linux-gnu < %s | FileCheck %s declare i32 @test(i32) local_unnamed_addr declare i32 @test1(i64) local_unnamed_addr define i32 @assertzext(i32 %n, i1 %a, i32* %b) local_unnamed_addr { +; CHECK-LABEL: assertzext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov w8, #33066 +; CHECK-NEXT: tst w1, #0x1 +; CHECK-NEXT: movk w8, #28567, lsl #16 +; CHECK-NEXT: csel w19, wzr, w8, ne +; CHECK-NEXT: cbnz w0, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: mov w19, wzr +; CHECK-NEXT: str wzr, [x2] +; CHECK-NEXT: .LBB0_2: // %if.end +; CHECK-NEXT: mov w0, w19 +; CHECK-NEXT: bl test +; CHECK-NEXT: mov w0, w19 +; CHECK-NEXT: bl test1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret entry: %i = select i1 %a, i64 0, i64 66296709418 %conv.i = trunc i64 %i to i32 @@ -20,14 +43,29 @@ %i2 = sext i32 %i1 to i64 %call1.i = tail call i32 @test1(i64 %i2) ret i32 0 -; CHECK: // %if.end -; CHECK: mov w{{[0-9]+}}, w{{[0-9]+}} -; CHECK: bl test -; CHECK: mov w{{[0-9]+}}, w{{[0-9]+}} -; CHECK: bl test1 } define i32 @assertsext(i32 %n, i8 %a) local_unnamed_addr { +; CHECK-LABEL: assertsext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz w0, .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: b .LBB1_3 +; CHECK-NEXT: .LBB1_2: // %if.then +; CHECK-NEXT: mov x9, #24575 +; CHECK-NEXT: sxtb w8, w1 +; CHECK-NEXT: movk x9, #15873, lsl #16 +; CHECK-NEXT: movk x9, #474, lsl #32 +; CHECK-NEXT: udiv x0, x9, x8 +; CHECK-NEXT: .LBB1_3: // %if.end +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl test1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %conv.i = sext i8 %a to i32 %cmp = icmp eq i32 %n, 0 @@ -37,9 +75,6 @@ %conv1 = zext i32 %conv.i to i64 %div = udiv i64 2036854775807, %conv1 br label %if.end -; CHECK: // %if.then -; CHECK: mov w{{[0-9]+}}, w{{[0-9]+}} -; CHECK: udiv x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} if.end: ; preds = %if.then, %entry %i1 = phi i64 [ %div, %if.then ], [ 0, %entry ] Index: llvm/test/CodeGen/AArch64/redundant-mov-from-zero-extend.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/redundant-mov-from-zero-extend.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +define i32 @test(i32 %input, i32 %n, i32 %a) { +; CHECK-LABEL: test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz w1, .LBB0_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: // %bb.0 +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: mov w0, #100 +; CHECK-NEXT: cmp w8, #4 +; CHECK-NEXT: b.hi .LBB0_5 +; CHECK-NEXT: // %bb.3: // %bb.0 +; CHECK-NEXT: adrp x9, .LJTI0_0 +; CHECK-NEXT: add x9, x9, :lo12:.LJTI0_0 +; CHECK-NEXT: adr x10, .LBB0_4 +; CHECK-NEXT: ldrb w11, [x9, x8] +; CHECK-NEXT: add x10, x10, x11, lsl #2 +; CHECK-NEXT: br x10 +; CHECK-NEXT: .LBB0_4: // %sw.bb +; CHECK-NEXT: add w0, w2, #1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_5: // %bb.0 +; CHECK-NEXT: cmp w8, #200 +; CHECK-NEXT: b.ne .LBB0_10 +; CHECK-NEXT: // %bb.6: // %sw.bb7 +; CHECK-NEXT: add w0, w2, #7 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_7: // %sw.bb1 +; CHECK-NEXT: add w0, w2, #3 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_8: // %sw.bb3 +; CHECK-NEXT: add w0, w2, #4 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_9: // %sw.bb5 +; CHECK-NEXT: add w0, w2, #5 +; CHECK-NEXT: .LBB0_10: // %return +; CHECK-NEXT: ret +entry: + %b = add nsw i32 %input, %n + %cmp = icmp eq i32 %n, 0 + br i1 %cmp, label %bb.0, label %return + +bb.0: + switch i32 %b, label %return [ + i32 0, label %sw.bb + i32 1, label %sw.bb1 + i32 2, label %sw.bb3 + i32 4, label %sw.bb5 + i32 200, label %sw.bb7 + ] + +sw.bb: + %add = add nsw i32 %a, 1 + br label %return + +sw.bb1: + %add2 = add nsw i32 %a, 3 + br label %return + +sw.bb3: + %add4 = add nsw i32 %a, 4 + br label %return + +sw.bb5: + %add6 = add nsw i32 %a, 5 + br label %return + +sw.bb7: + %add8 = add nsw i32 %a, 7 + br label %return + +return: + %retval.0 = phi i32 [ %add8, %sw.bb7 ], [ %add6, %sw.bb5 ], [ %add4, %sw.bb3 ], [ %add2, %sw.bb1 ], [ %add, %sw.bb ], [ 100, %bb.0 ], [ 0, %entry ] + ret i32 %retval.0 +} Index: llvm/test/CodeGen/AArch64/redundant-orrwrs-from-zero-extend.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/redundant-orrwrs-from-zero-extend.mir @@ -0,0 +1,69 @@ +# RUN: llc -mtriple=aarch64 -run-pass aarch64-mi-peephole-opt -verify-machineinstrs -o - %s | FileCheck %s +--- +name: test1 +# CHECK-LABEL: name: test1 +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr32 } + - { id: 2, class: gpr32 } + - { id: 3, class: gpr32 } + - { id: 4, class: gpr64 } +body: | + bb.0: + liveins: $w0, $w1 + + %0:gpr32 = COPY $w0 + %1:gpr32 = COPY $w1 + B %bb.1 + + bb.1: + %2:gpr32 = nsw ADDWrr %0, %1 + B %bb.2 + + bb.2: + ; CHECK-LABEL: bb.2: + ; CHECK-NOT: %3:gpr32 = ORRWrs $wzr, %2, 0 + ; The ORRWrs should be removed. + %3:gpr32 = ORRWrs $wzr, %2, 0 + %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32 + B %bb.3 + + bb.3: + $x0 = COPY %4 + RET_ReallyLR implicit $x0 +... +--- +name: test2 +# CHECK-LABEL: name: test2 +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gpr64 } + - { id: 1, class: gpr32 } + - { id: 2, class: gpr32 } + - { id: 3, class: gpr64 } +body: | + bb.0: + liveins: $x0 + + %0:gpr64 = COPY $x0 + B %bb.1 + + bb.1: + %1:gpr32 = EXTRACT_SUBREG %0, %subreg.sub_32 + B %bb.2 + + bb.2: + ; CHECK-LABEL: bb.2: + ; CHECK: %2:gpr32 = ORRWrs $wzr, %1, 0 + ; The ORRWrs should not be removed. + %2:gpr32 = ORRWrs $wzr, %1, 0 + %3:gpr64 = SUBREG_TO_REG 0, %2, %subreg.sub_32 + B %bb.3 + + bb.3: + $x0 = COPY %3 + RET_ReallyLR implicit $x0 +...