Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -15,6 +15,25 @@
 // later. In this case, we could try to split the constant operand of mov
 // instruction into two bitmask immediates. It makes two AND instructions
 // instead of multiple `mov` + `and` instructions.
+//
+// 2. Remove redundant ORRWrs instructions generated by zero-extension.
+//
+// There are two patterns for zero-extension, shown below.
+//
+// // In the case of a 32-bit def that is known to implicitly zero-extend,
+// // we can use a SUBREG_TO_REG.
+// def : Pat<(i64 (zext def32:$src)),
+//           (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+//
+// // When we need to explicitly zero-extend, we use a 32-bit MOV instruction
+// // and then assert the extension has happened.
+// def : Pat<(i64 (zext GPR32:$src)),
+//           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
+//
+// The def32 predicate checks whether $src needs an explicit zero-extend.
+// However, it cannot examine a $src defined in another basic block, so an
+// ORRWrs is added conservatively in that case. This peephole optimization
+// checks whether such an ORRWrs is a redundant zero-extend and removes it.
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ExpandImm.h"
@@ -44,6 +63,8 @@
   template <typename T>
   bool visitAND(MachineInstr &MI,
                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
+  bool visitORR(MachineInstr &MI,
+                SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -180,6 +201,49 @@
   return true;
 }
 
+bool AArch64MIPeepholeOpt::visitORR(
+    MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+  // Check that this ORR comes from the zero-extend pattern below.
+  //
+  // def : Pat<(i64 (zext GPR32:$src)),
+  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
+  if (MI.getOperand(3).getImm() != 0)
+    return false;
+
+  // The pattern above ORs with WZR; any other first operand means this is
+  // a genuine OR, not a zero-extending mov.
+  if (MI.getOperand(1).getReg() != AArch64::WZR)
+    return false;
+
+  // Check that the operand is defined in a different block. For a definition
+  // in the same block, ISel's def32 predicate has already decided whether an
+  // explicit zero-extend is needed.
+  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+  if (!SrcMI || SrcMI->getParent() == MI.getParent())
+    return false;
+
+  // Check whether the operand of the ORR is an EXTRACT_SUBREG from the
+  // truncate pattern below. In that case the upper 32 bits of the source
+  // register are not guaranteed to be zero, so keep the ORR.
+  //
+  // def : Pat<(i32 (trunc GPR64sp:$src)),
+  //           (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
+  if (SrcMI->getOpcode() == TargetOpcode::EXTRACT_SUBREG &&
+      SrcMI->getOperand(2).getImm() == AArch64::sub_32)
+    return false;
+
+  // Conservatively, do not remove the ORR when the operand is defined by the
+  // MIs below, as we cannot tell from them whether the upper 32 bits are zero.
+  if (SrcMI->getOpcode() == TargetOpcode::PHI ||
+      SrcMI->getOpcode() == TargetOpcode::COPY)
+    return false;
+
+  MRI->replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(2).getReg());
+  ToBeRemoved.insert(&MI);
+
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -205,6 +269,9 @@
       case AArch64::ANDXrr:
         Changed = visitAND<uint64_t>(MI, ToBeRemoved);
         break;
+      case AArch64::ORRWrs:
+        Changed = visitORR(MI, ToBeRemoved);
+        break;
       }
     }
   }
Index: llvm/test/CodeGen/AArch64/redundant-mov-from-zero-extend.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/redundant-mov-from-zero-extend.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define i32 @test(i32 %input, i32 %n, i32 %a) {
+; CHECK-LABEL: test:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cbz w1, .LBB0_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %bb.0
+; CHECK-NEXT:    add w8, w0, w1
+; CHECK-NEXT:    cmp w8, #4
+; CHECK-NEXT:    mov w0, #100
+; CHECK-NEXT:    b.hi .LBB0_5
+; CHECK-NEXT:  // %bb.3: // %bb.0
+; CHECK-NEXT:    adrp x9, .LJTI0_0
+; CHECK-NEXT:    add x9, x9, :lo12:.LJTI0_0
+; CHECK-NEXT:    adr x10, .LBB0_4
+; CHECK-NEXT:    ldrb w11, [x9, x8]
+; CHECK-NEXT:    add x10, x10, x11, lsl #2
+; CHECK-NEXT:    br x10
+; CHECK-NEXT:  .LBB0_4: // %sw.bb
+; CHECK-NEXT:    add w0, w2, #1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_5: // %bb.0
+; CHECK-NEXT:    cmp w8, #200
+; CHECK-NEXT:    b.ne .LBB0_10
+; CHECK-NEXT:  // %bb.6: // %sw.bb7
+; CHECK-NEXT:    add w0, w2, #7
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_7: // %sw.bb1
+; CHECK-NEXT:    add w0, w2, #3
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_8: // %sw.bb3
+; CHECK-NEXT:    add w0, w2, #4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_9: // %sw.bb5
+; CHECK-NEXT:    add w0, w2, #5
+; CHECK-NEXT:  .LBB0_10: // %return
+; CHECK-NEXT:    ret
+entry:
+  %b = add nsw i32 %input, %n
+  %cmp = icmp eq i32 %n, 0
+  br i1 %cmp, label %bb.0, label %return
+
+bb.0:
+  switch i32 %b, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb3
+    i32 4, label %sw.bb5
+    i32 200, label %sw.bb7
+  ]
+
+sw.bb:
+  %add = add nsw i32 %a, 1
+  br label %return
+
+sw.bb1:
+  %add2 = add nsw i32 %a, 3
+  br label %return
+
+sw.bb3:
+  %add4 = add nsw i32 %a, 4
+  br label %return
+
+sw.bb5:
+  %add6 = add nsw i32 %a, 5
+  br label %return
+
+sw.bb7:
+  %add8 = add nsw i32 %a, 7
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %add8, %sw.bb7 ], [ %add6, %sw.bb5 ], [ %add4, %sw.bb3 ], [ %add2, %sw.bb1 ], [ %add, %sw.bb ], [ 100, %bb.0 ], [ 0, %entry ]
+  ret i32 %retval.0
+}
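
To make the transformation concrete, below is a conceptual MIR sketch of what visitORR does. This is not compiler output: the register numbers, block labels, and operand details are invented for illustration, under the assumption that the 32-bit def reaches the ORRWrs across a block boundary without an intervening PHI or COPY. Since any AArch64 instruction writing a W register zeroes the upper 32 bits of the covering X register, the ADDWrr in bb.0 already produces a zero-extended value, and the ORRWrs that ISel emitted for the zext is redundant:

  Before:
    bb.0:
      %1:gpr32 = ADDWrr %a, %b          ; 32-bit def: upper 32 bits already zero
      ...
    bb.1:
      %2:gpr32 = ORRWrs $wzr, %1, 0     ; conservative zero-extending mov
      %3:gpr64 = SUBREG_TO_REG 0, %2, %subreg.sub_32

  After visitORR (uses of %2 are rewritten to %1, and the ORRWrs is removed):
    bb.1:
      %3:gpr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32

In final assembly this deletes a `mov w8, w0`-style instruction, which is what the test above checks for by asserting the exact instruction sequence around the jump table.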