diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12100,6 +12100,7 @@
     switch (II->getIntrinsicID()) {
     case Intrinsic::aarch64_neon_smull:
     case Intrinsic::aarch64_neon_umull:
+    case Intrinsic::aarch64_neon_pmull:
       if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
         Ops.push_back(&II->getOperandUse(0));
         Ops.push_back(&II->getOperandUse(1));
diff --git a/llvm/test/CodeGen/AArch64/neon-vmull-high-p8.ll b/llvm/test/CodeGen/AArch64/neon-vmull-high-p8.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-vmull-high-p8.ll
@@ -0,0 +1,114 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+neon | FileCheck %s
+
+; This test checks that the pmull2 instruction is used for the vmull_high_p8 intrinsic.
+; There are two shufflevector instructions located in different basic blocks:
+;
+; %shuffle.i3.i = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32>
+; %shuffle.i.i47 = shufflevector <16 x i8> %vuzp.i, <16 x i8> undef, <8 x i32>
+;
+; They are used by:
+;
+; @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i47, <8 x i8> %shuffle.i3.i) #2
+;
+; We test that the pattern replacing llvm.aarch64.neon.pmull with pmull2
+; is applied.
+
+; The IR for this test was generated from the following .cpp file:
+;
+; #include <arm_neon.h>
+;
+; inline poly16x8_t vmull_low_p8(poly8x16_t a, poly8x16_t b) {
+;   return vmull_p8(vget_low_p8(a), vget_low_p8(b));
+; }
+;
+; poly16x8x2_t p(const poly8_t *input, int len, poly8x16_t x, poly8x16_t X) {
+;   auto ptr = input + len;
+;   auto L = vdupq_n_p16(*--ptr), H = L;
+;   while (ptr > input)
+;   {
+;     auto s = vuzpq_p8(vreinterpretq_p8_p16(L), vreinterpretq_p8_p16(H));
+;     auto a = vmull_low_p8(s.val[0], x);
+;     auto b = vmull_high_p8(s.val[0], x);
+;     auto A = vmull_low_p8(s.val[1], X);
+;     auto B = vmull_high_p8(s.val[1], X);
+;     auto C = vdupq_n_p16(*--ptr);
+;     L = C ^ a ^ A;
+;     H = C ^ b ^ B;
+;   }
+;   return {L,H};
+; }
+
+; CHECK-LABEL: _Z1pPKhi12__Poly8x16_tS1_:
+; CHECK: pmull2
+; ModuleID = 'armc.cpp'
+source_filename = "armc.cpp"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnu"
+
+%struct.poly16x8x2_t = type { [2 x <8 x i16>] }
+
+; Function Attrs: mustprogress nofree nosync nounwind readonly uwtable willreturn
+define dso_local %struct.poly16x8x2_t @_Z1pPKhi12__Poly8x16_tS1_(i8* readonly %input, i32 %len, <16 x i8> %x, <16 x i8> %X) local_unnamed_addr #0 {
+entry:
+  %idx.ext = sext i32 %len to i64
+  %incdec.ptr.idx = add nsw i64 %idx.ext, -1
+  %incdec.ptr = getelementptr inbounds i8, i8* %input, i64 %incdec.ptr.idx
+  %0 = load i8, i8* %incdec.ptr, align 1, !tbaa !8
+  %conv = zext i8 %0 to i16
+  %vecinit.i = insertelement <8 x i16> undef, i16 %conv, i32 0
+  %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> poison, <8 x i32> zeroinitializer
+  %shuffle.i3.i = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32>
+  %shuffle.i4.i = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32>
+  %shuffle.i3.i50 = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32>
+  %shuffle.i4.i43 = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32>
+  %cmp52 = icmp sgt i32 %len, 1
+  br i1 %cmp52, label %while.body, label %while.end
+
+while.body:                                       ; preds = %entry, %while.body
+  %ptr.055 = phi i8* [ %incdec.ptr14, %while.body ], [ %incdec.ptr, %entry ]
+  %L.054 = phi <8 x i16> [ %xor17, %while.body ], [ %vecinit7.i, %entry ]
+  %H.053 = phi <8 x i16> [ %xor19, %while.body ], [ %vecinit7.i, %entry ]
+  %1 = bitcast <8 x i16> %L.054 to <16 x i8>
+  %2 = bitcast <8 x i16> %H.053 to <16 x i8>
+  %vuzp.i = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32>
+  %vuzp1.i = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32>
+  %shuffle.i.i47 = shufflevector <16 x i8> %vuzp.i, <16 x i8> undef, <8 x i32>
+  %vmull.i.i48 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i47, <8 x i8> %shuffle.i3.i) #2
+  %shuffle.i.i = shufflevector <16 x i8> %vuzp.i, <16 x i8> undef, <8 x i32>
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i4.i) #2
+  %shuffle.i.i49 = shufflevector <16 x i8> %vuzp1.i, <16 x i8> undef, <8 x i32>
+  %vmull.i.i51 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i49, <8 x i8> %shuffle.i3.i50) #2
+  %shuffle.i.i42 = shufflevector <16 x i8> %vuzp1.i, <16 x i8> undef, <8 x i32>
+  %vmull.i.i44 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i42, <8 x i8> %shuffle.i4.i43) #2
+  %incdec.ptr14 = getelementptr inbounds i8, i8* %ptr.055, i64 -1
+  %3 = load i8, i8* %incdec.ptr14, align 1, !tbaa !8
+  %conv15 = zext i8 %3 to i16
+  %vecinit.i45 = insertelement <8 x i16> undef, i16 %conv15, i32 0
+  %vecinit7.i46 = shufflevector <8 x i16> %vecinit.i45, <8 x i16> poison, <8 x i32> zeroinitializer
+  %xor = xor <8 x i16> %vmull.i.i51, %vmull.i.i48
+  %xor17 = xor <8 x i16> %xor, %vecinit7.i46
+  %xor18 = xor <8 x i16> %vmull.i.i44, %vmull.i.i
+  %xor19 = xor <8 x i16> %xor18, %vecinit7.i46
+  %cmp = icmp ugt i8* %incdec.ptr14, %input
+  br i1 %cmp, label %while.body, label %while.end, !llvm.loop !11
+
+while.end:                                        ; preds = %while.body, %entry
+  %H.0.lcssa = phi <8 x i16> [ %vecinit7.i, %entry ], [ %xor19, %while.body ]
+  %L.0.lcssa = phi <8 x i16> [ %vecinit7.i, %entry ], [ %xor17, %while.body ]
+  %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %L.0.lcssa, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %H.0.lcssa, 0, 1
+  ret %struct.poly16x8x2_t %.fca.0.1.insert
+}
+
+; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) #1
+
+attributes #0 = { mustprogress nofree nosync nounwind readonly uwtable willreturn "frame-pointer"="non-leaf" "min-legal-vector-width"="128" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" }
+attributes #1 = { mustprogress nofree nosync nounwind readnone willreturn }
+attributes #2 = { nounwind }
+
+!8 = !{!9, !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
+!11 = distinct !{!11, !12}
+!12 = !{!"llvm.loop.mustprogress"}