diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16115,8 +16115,37 @@
     }
   }
 
+  // Canonicalise concat_vectors to replace concatenations of truncated nots
+  // with nots of concatenated truncates. This in some cases allows for multiple
+  // redundant negations to be eliminated. Requires single-use trunc/not nodes.
+  //  (concat_vectors (v4i16 (truncate (not (v4i32)))),
+  //                  (v4i16 (truncate (not (v4i32)))))
+  // ->
+  //  (not (concat_vectors (v4i16 (truncate (v4i32))),
+  //                       (v4i16 (truncate (v4i32)))))
+  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
+      N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
+      N->isOnlyUserOf(N1.getNode())) {
+    auto isBitwiseVectorNegate = [](SDValue V) {
+      return V->getOpcode() == ISD::XOR &&
+             ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
+    };
+    SDValue N00 = N0->getOperand(0);
+    SDValue N10 = N1->getOperand(0);
+    if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
+        isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
+      return DAG.getNOT(
+          dl,
+          DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                      DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
+                                  N00->getOperand(0)),
+                      DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
+                                  N10->getOperand(0))),
+          VT);
+    }
+  }
-  // Wait 'til after everything is legalized to try this. That way we have
+  // Wait till after everything is legalized to try this. That way we have
   // legal vector types and such.
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -10,9 +10,8 @@
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    umaxv b0, v0.8b
 ; CHECK-NEXT:    fmov w9, s0
@@ -32,13 +31,10 @@
 ; CHECK-NEXT:    fcmgt v2.4s, v2.4s, #0.0
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    umaxv b0, v0.16b
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    bic w0, w8, w9
@@ -52,30 +48,23 @@
 define i1 @unordered_floating_point_compare_on_v32f32(<32 x float> %a_vec) {
 ; CHECK-LABEL: unordered_floating_point_compare_on_v32f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcmgt v7.4s, v7.4s, #0.0
-; CHECK-NEXT:    mov w9, #1
-; CHECK-NEXT:    fcmgt v6.4s, v6.4s, #0.0
-; CHECK-NEXT:    fcmgt v5.4s, v5.4s, #0.0
-; CHECK-NEXT:    fcmgt v4.4s, v4.4s, #0.0
 ; CHECK-NEXT:    fcmgt v3.4s, v3.4s, #0.0
+; CHECK-NEXT:    mov w9, #1
 ; CHECK-NEXT:    fcmgt v2.4s, v2.4s, #0.0
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
-; CHECK-NEXT:    mvn v7.16b, v7.16b
-; CHECK-NEXT:    mvn v6.16b, v6.16b
-; CHECK-NEXT:    mvn v5.16b, v5.16b
-; CHECK-NEXT:    mvn v4.16b, v4.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    uzp1 v6.8h, v6.8h, v7.8h
-; CHECK-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-NEXT:    fcmgt v7.4s, v7.4s, #0.0
+; CHECK-NEXT:    fcmgt v6.4s, v6.4s, #0.0
+; CHECK-NEXT:    fcmgt v5.4s, v5.4s, #0.0
+; CHECK-NEXT:    fcmgt v4.4s, v4.4s, #0.0
 ; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    uzp1 v1.16b, v4.16b, v6.16b
+; CHECK-NEXT:    uzp1 v6.8h, v6.8h, v7.8h
+; CHECK-NEXT:    uzp1 v1.8h, v4.8h, v5.8h
 ; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v6.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    umaxv b0, v0.16b
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    bic w0, w9, w8
diff --git a/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define <8 x i16> @not_not_trunc_concat(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: not_not_trunc_concat:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %notx = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trnx = trunc <4 x i32> %notx to <4 x i16>
+  %noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trny = trunc <4 x i32> %noty to <4 x i16>
+  %r = shufflevector <4 x i16> %trnx, <4 x i16> %trny, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %r
+}
+
+; Chains of concat -> truncate -> negate should flatten out to a single negate.
+define <16 x i8> @not_not_trunc_concat_chain(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: not_not_trunc_concat_chain:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %nota = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trna = trunc <4 x i32> %nota to <4 x i16>
+  %notb = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trnb = trunc <4 x i32> %notb to <4 x i16>
+  %concat_a = shufflevector <4 x i16> %trna, <4 x i16> %trnb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %trun_concat_a = trunc <8 x i16> %concat_a to <8 x i8>
+  %notx = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trnx = trunc <4 x i32> %notx to <4 x i16>
+  %noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trny = trunc <4 x i32> %noty to <4 x i16>
+  %concat_b = shufflevector <4 x i16> %trnx, <4 x i16> %trny, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %trun_concat_b = trunc <8 x i16> %concat_b to <8 x i8>
+  %r = shufflevector <8 x i8> %trun_concat_a, <8 x i8> %trun_concat_b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %r
+}
+
+; Combine should not fire here, otherwise slightly worse code will be emitted.
+define <8 x i16> @not_not_trunc_concat_multiple_uses(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: not_not_trunc_concat_multiple_uses:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    mvn v1.8b, v1.8b
+; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    mov v2.d[1], v1.d[0]
+; CHECK-NEXT:    mov v0.d[1], v0.d[0]
+; CHECK-NEXT:    add v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    ret
+  %notx = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trnx = trunc <4 x i32> %notx to <4 x i16>
+  %noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trny = trunc <4 x i32> %noty to <4 x i16>
+  %concat = shufflevector <4 x i16> %trnx, <4 x i16> %trny, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %add = add <4 x i16> %trnx, %trny
+  %extend_add = shufflevector <4 x i16> %add, <4 x i16> %add, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %r = add <8 x i16> %concat, %extend_add
+  ret <8 x i16> %r
+
+}
+