diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -46,6 +46,21 @@
 // source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
 // instructions.
 //
+// 7. If an MI implicitly sets the high 64 bits of a register to zero, remove
+//    the redundant `mov 0` of the high 64 bits. For example,
+//
+//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+//   %2:fpr64 = MOVID 0
+//   %4:fpr128 = IMPLICIT_DEF
+//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
+//   %6:fpr128 = IMPLICIT_DEF
+//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+//   ==>
+//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+//   %6:fpr128 = IMPLICIT_DEF
+//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+//
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ExpandImm.h"
@@ -111,6 +126,7 @@
   bool visitORR(MachineInstr &MI);
   bool visitINSERT(MachineInstr &MI);
   bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
+  bool visitINSvi64lane(MachineInstr &MI);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -592,6 +608,74 @@
   return true;
 }
 
+static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI) {
+  // TODO: Add more MIs which set zero for the high 64 bits.
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::FCVTNv2i32:
+  case AArch64::FCVTNv4i16:
+  case AArch64::RSHRNv2i32_shift:
+  case AArch64::RSHRNv4i16_shift:
+  case AArch64::RSHRNv8i8_shift:
+  case AArch64::SHRNv2i32_shift:
+  case AArch64::SHRNv4i16_shift:
+  case AArch64::SHRNv8i8_shift:
+    return true;
+  }
+
+  return false;
+}
+
+bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
+  // Check that the MI defining the low 64 bits implicitly zeroes the high
+  // 64 bits. We expect the pattern below.
+  //
+  //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+  //  %6:fpr128 = IMPLICIT_DEF
+  //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
+  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
+    return false;
+  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
+  if (!is64bitDefwithZeroHigh64bit(Low64MI))
+    return false;
+
+  // Check that there is a `mov 0` MI for the high 64 bits.
+  // We expect one of the patterns below.
+  //
+  //  %2:fpr64 = MOVID 0
+  //  %4:fpr128 = IMPLICIT_DEF
+  //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
+  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+  // or
+  //  %5:fpr128 = MOVIv2d_ns 0
+  //  %6:fpr64 = COPY %5.dsub:fpr128
+  //  %8:fpr128 = IMPLICIT_DEF
+  //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
+  //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
+  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
+  if (High64MI->getOpcode() != AArch64::INSERT_SUBREG)
+    return false;
+  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
+  if (High64MI->getOpcode() == TargetOpcode::COPY)
+    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
+  if (High64MI->getOpcode() != AArch64::MOVID &&
+      High64MI->getOpcode() != AArch64::MOVIv2d_ns)
+    return false;
+  if (High64MI->getOperand(1).getImm() != 0)
+    return false;
+
+  // Remove the redundant INSvi64lane for the high 64 bits.
+  Register OldDef = MI.getOperand(0).getReg();
+  Register NewDef = MI.getOperand(1).getReg();
+  MRI->replaceRegWith(OldDef, NewDef);
+  MI.eraseFromParent();
+
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -667,6 +751,9 @@
       case AArch64::INSvi8gpr:
         Changed = visitINSviGPR(MI, AArch64::INSvi8lane);
         break;
+      case AArch64::INSvi64lane:
+        Changed = visitINSvi64lane(MI);
+        break;
       }
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -146,16 +146,14 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: movi.2d v1, #0000000000000000
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: movi.2d v3, #0000000000000000
-; CHECK-NEXT: uaddlv.4s d2, v1
-; CHECK-NEXT: str d3, [x0, #16]
-; CHECK-NEXT: mov.d v1[0], v2[0]
-; CHECK-NEXT: ucvtf.2d v1, v1
-; CHECK-NEXT: fcvtn v1.2s, v1.2d
-; CHECK-NEXT: mov.d v1[1], v0[0]
-; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: movi.2d v2, #0000000000000000
+; CHECK-NEXT: uaddlv.4s d1, v0
+; CHECK-NEXT: str d2, [x0, #16]
+; CHECK-NEXT: mov.d v0[0], v1[0]
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
 entry:
@@ -188,15 +186,13 @@ define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v5i64_uaddlv_from_v4i32:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: str wzr, [x0, #16]
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: uaddlv.4s d2, v1
-; CHECK-NEXT: mov.d v1[0], v2[0]
-; CHECK-NEXT: ucvtf.2d v1, v1
-; CHECK-NEXT: fcvtn v1.2s, v1.2d
-; CHECK-NEXT: mov.d v1[1], v0[0]
-; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: uaddlv.4s d1, v0
+; CHECK-NEXT: mov.d v0[0], v1[0]
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
 entry:
@@ -255,16 +251,14 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
 ; CHECK-LABEL: insert_vec_v16i64_uaddlv_from_v4i16:
 ; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: movi.2d v1, #0000000000000000
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.4h s3, v1
-; CHECK-NEXT: stp q1, q1, [x0, #32]
-; CHECK-NEXT: mov.s v2[0], v3[0]
-; CHECK-NEXT: ucvtf.2d v2, v2
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: mov.d v2[1], v0[0]
-; CHECK-NEXT: stp q2, q1, [x0]
+; CHECK-NEXT: uaddlv.4h s2, v0
+; CHECK-NEXT: stp q0, q0, [x0, #32]
+; CHECK-NEXT: mov.s v1[0], v2[0]
+; CHECK-NEXT: ucvtf.2d v1, v1
+; CHECK-NEXT: fcvtn v1.2s, v1.2d
+; CHECK-NEXT: stp q1, q0, [x0]
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>) #2
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) #2
+
+define <8 x half> @test1(<4 x float> noundef %a) {
+; CHECK-LABEL: test1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+entry:
+  %vcvt_f16_f321.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a)
+  %0 = bitcast <4 x i16> %vcvt_f16_f321.i to <4 x half>
+  %shuffle.i = shufflevector <4 x half> %0, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x half> %shuffle.i
+}
+
+define <8 x i8> @test2(ptr nocapture noundef readonly %in, <8 x i8> noundef %idx) {
+; CHECK-LABEL: test2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: shrn v1.8b, v1.8h, #4
+; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
+; CHECK-NEXT: ret
+entry:
+  %0 = load <8 x i16>, ptr %in, align 2
+  %1 = lshr <8 x i16> %0, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+  %vtbl1.i = shufflevector <8 x i8> %vshrn_n, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %idx)
+  ret <8 x i8> %vtbl11.i
+}
+
diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
--- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
+++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
@@ -79,10 +79,9 @@
  ; CHECK-NEXT: [[FCVTNv2i32_:%[0-9]+]]:fpr64 = nofpexcept FCVTNv2i32 killed [[UCVTFv2f64_]], implicit $fpcr
  ; CHECK-NEXT: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
  ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], killed [[FCVTNv2i32_]], %subreg.dsub
- ; CHECK-NEXT: [[INSvi64lane1:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG2]], 1, killed [[INSERT_SUBREG1]], 0
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub
  ; CHECK-NEXT: STRDui killed [[COPY2]], [[COPY]], 2 :: (store (s64) into %ir.0 + 16)
- ; CHECK-NEXT: STRQui killed [[INSvi64lane1]], [[COPY]], 0 :: (store (s128) into %ir.0, align 8)
+ ; CHECK-NEXT: STRQui killed [[INSERT_SUBREG2]], [[COPY]], 0 :: (store (s128) into %ir.0, align 8)
  ; CHECK-NEXT: RET_ReallyLR
 %0:gpr64common = COPY $x0
 %1:fpr128 = MOVIv2d_ns 0