diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18029,7 +18029,76 @@
     }
   }
 
-  return SDValue();
+  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
+  // Only implemented on little-endian subtargets.
+  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  // This optimization only works on little endian.
+  if (!IsLittleEndian)
+    return SDValue();
+
+  if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
+    return SDValue();
+
+  auto getSourceOp = [](SDValue Operand) -> SDValue {
+    const unsigned Opcode = Operand.getOpcode();
+    if (Opcode == ISD::TRUNCATE)
+      return Operand->getOperand(0);
+    if (Opcode == ISD::BITCAST &&
+        Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
+      return Operand->getOperand(0)->getOperand(0);
+    return SDValue();
+  };
+
+  SDValue SourceOp0 = getSourceOp(Op0);
+  SDValue SourceOp1 = getSourceOp(Op1);
+
+  if (!SourceOp0 || !SourceOp1)
+    return SDValue();
+
+  if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
+      !SourceOp0.getValueType().isSimple())
+    return SDValue();
+
+  EVT ResultTy;
+
+  switch (SourceOp0.getSimpleValueType().SimpleTy) {
+  case MVT::v2i64:
+    ResultTy = MVT::v4i32;
+    break;
+  case MVT::v4i32:
+    ResultTy = MVT::v8i16;
+    break;
+  case MVT::v8i16:
+    ResultTy = MVT::v16i8;
+    break;
+  default:
+    return SDValue();
+  }
+
+  SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
+  SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
+  SDValue UzpResult =
+      DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
+
+  EVT BitcastResultTy;
+
+  switch (ResVT.getSimpleVT().SimpleTy) {
+  case MVT::v2i32:
+    BitcastResultTy = MVT::v2i64;
+    break;
+  case MVT::v4i16:
+    BitcastResultTy = MVT::v4i32;
+    break;
+  case MVT::v8i8:
+    BitcastResultTy = MVT::v8i16;
+    break;
+  default:
+    llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
+  }
+
+  return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
+                     DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
 }
 
 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
diff --git a/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll b/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll
@@ -2,14 +2,11 @@
 ; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LE %s
 ; RUN: llc < %s -mtriple aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
 
-; Test cases to show when UZP1 (TRUNC, TRUNC) could be combined to TRUNC (UZP1) but not yet implemented.
-
 define <4 x i16> @test_combine_v4i16_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LE-LABEL: test_combine_v4i16_v2i64:
 ; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v0.2s, v0.2d
-; CHECK-LE-NEXT: xtn v1.2s, v1.2d
-; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT: xtn v0.4h, v0.4s
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test_combine_v4i16_v2i64:
@@ -36,9 +33,8 @@
 define <4 x i16> @test_combine_v4i16_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LE-LABEL: test_combine_v4i16_v4i32:
 ; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-LE-NEXT: xtn v0.4h, v0.4s
-; CHECK-LE-NEXT: xtn v1.4h, v1.4s
-; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test_combine_v4i16_v4i32:
@@ -62,9 +58,8 @@
 define <4 x i16> @test_combine_v4i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LE-LABEL: test_combine_v4i16_v8i16:
 ; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v0.8b, v0.8h
-; CHECK-LE-NEXT: xtn v1.8b, v1.8h
-; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-LE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-LE-NEXT: xtn v0.4h, v0.4s
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test_combine_v4i16_v8i16:
@@ -94,9 +89,8 @@
 define <8 x i8> @test_combine_v8i8_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LE-LABEL: test_combine_v8i8_v2i64:
 ; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v0.2s, v0.2d
-; CHECK-LE-NEXT: xtn v1.2s, v1.2d
-; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT: xtn v0.8b, v0.8h
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test_combine_v8i8_v2i64:
@@ -123,9 +117,8 @@
 define <8 x i8> @test_combine_v8i8_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LE-LABEL: test_combine_v8i8_v4i32:
 ; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v0.4h, v0.4s
-; CHECK-LE-NEXT: xtn v1.4h, v1.4s
-; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-LE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-LE-NEXT: xtn v0.8b, v0.8h
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test_combine_v8i8_v4i32:
@@ -154,9 +147,8 @@
 define <8 x i8> @test_combine_v8i8_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LE-LABEL: test_combine_v8i8_v8i16:
 ; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-LE-NEXT: xtn v0.8b, v0.8h
-; CHECK-LE-NEXT: xtn v1.8b, v1.8h
-; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test_combine_v8i8_v8i16:
@@ -266,9 +258,8 @@
 define i8 @trunc_v4i64_v4i8(<4 x i64> %input) {
 ; CHECK-LE-LABEL: trunc_v4i64_v4i8:
 ; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v1.2s, v1.2d
-; CHECK-LE-NEXT: xtn v0.2s, v0.2d
-; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT: xtn v0.4h, v0.4s
 ; CHECK-LE-NEXT: addv h0, v0.4h
 ; CHECK-LE-NEXT: fmov w0, s0
 ; CHECK-LE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -2980,9 +2980,8 @@
 ; CHECK-NEXT: mvni v2.4s, #127
 ; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
 ; CHECK-NEXT: ret
   %x = call <8 x i8> @llvm.fptosi.sat.v8f32.v8i8(<8 x float> %f)
   ret <8 x i8> %x
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -2491,9 +2491,8 @@
 ; CHECK-NEXT: fcvtzu v0.4s, v0.4s
 ; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
 ; CHECK-NEXT: ret
   %x = call <8 x i8> @llvm.fptoui.sat.v8f32.v8i8(<8 x float> %f)
   ret <8 x i8> %x
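
For reference, a minimal standalone sketch (not part of the patch) of the two type mappings the new combine relies on: each truncate source is bitcast to the matching full-width vector type so a single UZP1 can concatenate the narrowed elements, and the UZP1 result is bitcast again before the final truncate. Plain enums stand in for llvm::MVT so the sketch builds without LLVM headers; all names below are illustrative, not code from the patch.

// Standalone sketch only: mirrors the two switch statements in the combine,
// using a plain enum instead of llvm::MVT so it compiles on its own.
#include <cassert>
#include <cstdio>

enum class VT { v2i32, v4i16, v8i8, v2i64, v4i32, v8i16, v16i8, Invalid };

// Type used for the widened UZP1 operands (first switch in the patch).
static VT uzpTypeForSource(VT SrcVT) {
  switch (SrcVT) {
  case VT::v2i64: return VT::v4i32;
  case VT::v4i32: return VT::v8i16;
  case VT::v8i16: return VT::v16i8;
  default:        return VT::Invalid; // the combine bails out here
  }
}

// Type the UZP1 result is bitcast to before the final truncate
// (second switch in the patch).
static VT bitcastTypeForResult(VT ResVT) {
  switch (ResVT) {
  case VT::v2i32: return VT::v2i64;
  case VT::v4i16: return VT::v4i32;
  case VT::v8i8:  return VT::v8i16;
  default:        return VT::Invalid;
  }
}

int main() {
  // test_combine_v4i16_v2i64: trunc <2 x i64> sources, <4 x i16> result ->
  // uzp1 on v4i32 operands, then truncate a v4i32 view of the result.
  assert(uzpTypeForSource(VT::v2i64) == VT::v4i32);
  assert(bitcastTypeForResult(VT::v4i16) == VT::v4i32);
  std::puts("uzp1-combine type mapping: ok");
  return 0;
}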