Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -46,6 +46,21 @@
 //    source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
 //    instructions.
 //
+// 7. If an instruction implicitly zeroes the high 64 bits of its result,
+//    remove the redundant `mov 0` of the high 64 bits. For example,
+//
+//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+//   %2:fpr64 = MOVID 0
+//   %4:fpr128 = IMPLICIT_DEF
+//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
+//   %6:fpr128 = IMPLICIT_DEF
+//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+//   ==>
+//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+//   %6:fpr128 = IMPLICIT_DEF
+//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+//
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ExpandImm.h"
@@ -111,6 +126,7 @@
   bool visitORR(MachineInstr &MI);
   bool visitINSERT(MachineInstr &MI);
   bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
+  bool visitINSvi64lane(MachineInstr &MI);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -592,6 +608,73 @@
   return true;
 }
 
+static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI) {
+  // TODO: Check for and add more opcodes that implicitly zero the high
+  // 64 bits of their 128-bit destination register.
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::FCVTNv4i16:
+  case AArch64::SHRNv8i8_shift:
+    return true;
+  }
+}
+
+bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
+  // Check that the instruction defining the low 64 bits implicitly zeroes
+  // the high 64 bits. We are expecting the case below.
+  //
+  //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+  //  %6:fpr128 = IMPLICIT_DEF
+  //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
+  if (!Low64MI || Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
+    return false;
+  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
+  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI))
+    return false;
+
+  // The fold is only valid when the INSvi64lane writes the high 64 bits
+  // (lane 1); inserting into lane 0 would clobber the low half.
+  if (MI.getOperand(2).getImm() != 1)
+    return false;
+
+  // Check that there is a `mov 0` for the high 64 bits.
+  // We are expecting the cases below.
+  //
+  //  %2:fpr64 = MOVID 0
+  //  %4:fpr128 = IMPLICIT_DEF
+  //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
+  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+  // or
+  //  %5:fpr128 = MOVIv2d_ns 0
+  //  %6:fpr64 = COPY %5.dsub:fpr128
+  //  %8:fpr128 = IMPLICIT_DEF
+  //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
+  //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
+  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
+  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
+    return false;
+  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
+  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
+    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
+  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
+                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
+    return false;
+  if (High64MI->getOperand(1).getImm() != 0)
+    return false;
+
+  // The high 64 bits are already zero, so the insert is redundant; forward
+  // the low-half value and let the `mov 0` chain become dead.
+  Register OldDef = MI.getOperand(0).getReg();
+  Register NewDef = MI.getOperand(1).getReg();
+  MRI->replaceRegWith(OldDef, NewDef);
+  MI.eraseFromParent();
+
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -667,6 +750,9 @@
       case AArch64::INSvi8gpr:
        Changed = visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
+      case AArch64::INSvi64lane:
+        Changed = visitINSvi64lane(MI);
+        break;
       }
     }
   }
Index: llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>)
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>)
+
+define nofpclass(nan inf) <8 x half> @test1(<4 x float> noundef nofpclass(nan inf) %a) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %vcvt_f16_f321.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a)
+  %0 = bitcast <4 x i16> %vcvt_f16_f321.i to <4 x half>
+  %shuffle.i = shufflevector <4 x half> %0, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x half> %shuffle.i
+}
+
+define <8 x i8> @test2(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone %dst, <8 x i8> noundef %idx) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #4
+; CHECK-NEXT:    tbl v0.8b, { v1.16b }, v0.8b
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <8 x i16>, ptr %in, align 2
+  %1 = lshr <8 x i16> %0, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+  %vtbl1.i = shufflevector <8 x i8> %vshrn_n, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %idx)
+  ret <8 x i8> %vtbl11.i
+}
+
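A note on the expected codegen (illustrative only, not part of the patch; register
numbers depend on allocation): FCVTN and SHRN writing a D register zero the upper
64 bits of the corresponding Q register at the ISA level, which is what makes the
explicit `mov 0` of the high half redundant. For test1 the fold changes the
emitted assembly roughly as follows:

  // before
  fcvtn v0.4h, v0.4s
  movi  d1, #0000000000000000
  mov   v0.d[1], v1.d[0]
  ret
  // after
  fcvtn v0.4h, v0.4s
  ret

The peephole can also be exercised in isolation on MIR input with
`llc -mtriple=aarch64-linux-gnu -run-pass=aarch64-mi-peephole-opt`.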