diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5300,6 +5300,9 @@
 // CodeGen patterns for addhn and subhn instructions, which can actually be
 // written in LLVM IR without too much difficulty.
 
+// Prioritize ADDHN and SUBHN over UZP2.
+let AddedComplexity = 10 in {
+
 // ADDHN
 def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
           (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
@@ -5350,6 +5353,8 @@
           (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
                             V128:$Rn, V128:$Rm)>;
 
+} // AddedComplexity = 10
+
 //----------------------------------------------------------------------------
 // AdvSIMD bitwise extract from vector instruction.
 //----------------------------------------------------------------------------
@@ -5416,6 +5421,19 @@
                              (v2i32 (trunc (v2i64 V128:$Vm))))),
           (UZP1v4i32 V128:$Vn, V128:$Vm)>;
 
+def : Pat<(v16i8 (concat_vectors
+                 (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
+                 (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vm), (i32 8)))))),
+          (UZP2v16i8 V128:$Vn, V128:$Vm)>;
+def : Pat<(v8i16 (concat_vectors
+                 (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vn), (i32 16)))),
+                 (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vm), (i32 16)))))),
+          (UZP2v8i16 V128:$Vn, V128:$Vm)>;
+def : Pat<(v4i32 (concat_vectors
+                 (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vn), (i32 32)))),
+                 (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))),
+          (UZP2v4i32 V128:$Vn, V128:$Vm)>;
+
 //----------------------------------------------------------------------------
 // AdvSIMD TBL/TBX instructions
 //----------------------------------------------------------------------------
diff --git a/llvm/test/CodeGen/AArch64/arm64-uzp2-combine.ll b/llvm/test/CodeGen/AArch64/arm64-uzp2-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-uzp2-combine.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s
+
+; Test the (concat_vectors (trunc (lshr)), (trunc (lshr))) pattern.
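+; Each test shifts both inputs right by half the element width, truncates the
+; results to the narrow type, and concatenates the two halves; this whole
+; sequence should now select to a single uzp2 instruction.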
+
+define <16 x i8> @test_combine_v8i16_to_v16i8(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: test_combine_v8i16_to_v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %lshr1 = lshr <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %trunc1 = trunc <8 x i16> %lshr1 to <8 x i8>
+  %lshr2 = lshr <8 x i16> %y, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %trunc2 = trunc <8 x i16> %lshr2 to <8 x i8>
+  %shuffle = shufflevector <8 x i8> %trunc1, <8 x i8> %trunc2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_combine_v4i32_to_v8i16(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_combine_v4i32_to_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+  %lshr1 = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+  %trunc1 = trunc <4 x i32> %lshr1 to <4 x i16>
+  %lshr2 = lshr <4 x i32> %y, <i32 16, i32 16, i32 16, i32 16>
+  %trunc2 = trunc <4 x i32> %lshr2 to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %trunc1, <4 x i16> %trunc2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_combine_v2i64_to_v4i32(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: test_combine_v2i64_to_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+  %lshr1 = lshr <2 x i64> %x, <i64 32, i64 32>
+  %trunc1 = trunc <2 x i64> %lshr1 to <2 x i32>
+  %lshr2 = lshr <2 x i64> %y, <i64 32, i64 32>
+  %trunc2 = trunc <2 x i64> %lshr2 to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %trunc1, <2 x i32> %trunc2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -935,6 +935,25 @@
   ret <4 x i32> %res
 }
 
+define <4 x i32> @addhn_addhn2_4s(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C, <2 x i64>* %D) nounwind {
+;CHECK-LABEL: addhn_addhn2_4s:
+;CHECK: addhn.2s
+;CHECK: addhn2.4s
+;CHECK-NOT: uzp2.4s
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %sum1 = add <2 x i64> %tmp1, %tmp2
+  %low_bits = lshr <2 x i64> %sum1, <i64 32, i64 32>
+  %narrowed1 = trunc <2 x i64> %low_bits to <2 x i32>
+  %tmp3 = load <2 x i64>, <2 x i64>* %C
+  %tmp4 = load <2 x i64>, <2 x i64>* %D
+  %sum2 = add <2 x i64> %tmp3, %tmp4
+  %high_bits = lshr <2 x i64> %sum2, <i64 32, i64 32>
+  %narrowed2 = trunc <2 x i64> %high_bits to <2 x i32>
+  %res = shufflevector <2 x i32> %narrowed1, <2 x i32> %narrowed2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %res
+}
+
 define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: subhn8b_natural:
 ;CHECK: subhn.8b