diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6603,6 +6603,34 @@
 defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
                 TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
 
+// RADDHN patterns for when RSHRN shifts by half the size of the vector element
+def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
+          (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))),
+          (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))),
+          (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;
+
+// RADDHN2 patterns for when RSHRN shifts by half the size of the vector element
+def : Pat<(v16i8 (concat_vectors
+                 (v8i8 V64:$Vd),
+                 (v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))))),
+          (RADDHNv8i16_v16i8
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v8i16 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v8i16 (concat_vectors
+                 (v4i16 V64:$Vd),
+                 (v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))))),
+          (RADDHNv4i32_v8i16
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v4i32 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v4i32 (concat_vectors
+                 (v2i32 V64:$Vd),
+                 (v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))))),
+          (RADDHNv2i64_v4i32
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v2i64 (MOVIv2d_ns (i32 0))))>;
+
 // SHRN patterns for when a logical right shift was used instead of arithmetic
 // (the immediate guarantees no sign bits actually end up in the result so it
 // doesn't matter).
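For context (not part of the patch): the rewrite is sound because a rounding shift right by exactly half the element width computes the rounded high half of the input added to zero, which is what RADDHN with a zero second operand produces. Below is a minimal scalar sketch of that equivalence for 16-bit lanes; the helper names rshrn16 and raddhn16 are purely illustrative.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar model of RSHRN #n on one 16-bit lane: (x + (1 << (n-1))) >> n,
   truncated to the 8-bit result element. */
static uint8_t rshrn16(uint16_t x, unsigned n) {
  return (uint8_t)(((uint32_t)x + (1u << (n - 1))) >> n);
}

/* Scalar model of RADDHN on one 16-bit lane pair: the rounded high half of
   (a + b), i.e. (a + b + (1 << 7)) >> 8, truncated to 8 bits. */
static uint8_t raddhn16(uint16_t a, uint16_t b) {
  return (uint8_t)(((uint32_t)a + b + (1u << 7)) >> 8);
}

int main(void) {
  /* When the RSHRN shift amount is half the element width (8 for 16-bit
     lanes), pairing the input with a zero operand and taking the rounded
     high half gives the same value for every input, which is the
     equivalence the new patterns rely on. */
  for (uint32_t x = 0; x <= UINT16_MAX; ++x)
    assert(rshrn16((uint16_t)x, 8) == raddhn16((uint16_t)x, 0));
  puts("rshrn #8 matches raddhn against zero for all 16-bit lanes");
  return 0;
}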
diff --git a/llvm/test/CodeGen/AArch64/arm64-raddhn-combine.ll b/llvm/test/CodeGen/AArch64/arm64-raddhn-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-raddhn-combine.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s
+
+define <16 x i8> @test_combine_v8i16_to_v16i8(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: test_combine_v8i16_to_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    raddhn v0.8b, v0.8h, v2.8h
+; CHECK-NEXT:    raddhn2 v0.16b, v1.8h, v2.8h
+; CHECK-NEXT:    ret
+entry:
+  %res = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %x, i32 8)
+  %res2 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %y, i32 8)
+  %shuffle = shufflevector <8 x i8> %res, <8 x i8> %res2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_combine_v4i32_to_v8i16(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_combine_v4i32_to_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    raddhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT:    raddhn2 v0.8h, v1.4s, v2.4s
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %x, i32 16)
+  %res2 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %y, i32 16)
+  %shuffle = shufflevector <4 x i16> %res, <4 x i16> %res2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_combine_v2i64_to_v4i32(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: test_combine_v2i64_to_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    raddhn v0.2s, v0.2d, v2.2d
+; CHECK-NEXT:    raddhn2 v0.4s, v1.2d, v2.2d
+; CHECK-NEXT:    ret
+entry:
+  %res = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %x, i32 32)
+  %res2 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %y, i32 32)
+  %shuffle = shufflevector <2 x i32> %res, <2 x i32> %res2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32)
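As a usage note (an illustrative sketch, not part of the patch): code written with the ACLE intrinsics vrshrn_n_u16 and vrshrn_high_n_u16 typically produces the llvm.aarch64.neon.rshrn call plus shufflevector shape exercised by the first test above, so with these patterns it should select raddhn/raddhn2 against a zeroed register instead of rshrn/rshrn2. The helper name narrow_pair is hypothetical.

#include <arm_neon.h>

/* Hypothetical helper: rounding-narrow two 16-bit vectors by half the element
   width and pack the results into one 128-bit vector, mirroring the IR in
   test_combine_v8i16_to_v16i8 above. */
uint8x16_t narrow_pair(uint16x8_t x, uint16x8_t y) {
  uint8x8_t lo = vrshrn_n_u16(x, 8);   /* llvm.aarch64.neon.rshrn, shift 8   */
  return vrshrn_high_n_u16(lo, y, 8);  /* second rshrn + concat shufflevector */
}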