diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1013,6 +1013,8 @@
   setTargetDAGCombine(ISD::VECREDUCE_OR);
   setTargetDAGCombine(ISD::VECREDUCE_XOR);
 
+  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemset =
@@ -23121,6 +23123,55 @@
   return SDValue();
 }
 
+static SDValue
+performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                             SelectionDAG &DAG) {
+  // Perform the following transform:
+  //
+  //   t34: v4i32 = AArch64ISD::UADDLV t2
+  //   t35: i32 = extract_vector_elt t34, Constant:i64<0>
+  //   t7: i64 = zero_extend t35
+  //   t20: v1i64 = scalar_to_vector t7
+  // ==>
+  //   t34: v4i32 = AArch64ISD::UADDLV t2
+  //   t39: v2i32 = extract_subvector t34, Constant:i64<0>
+  //   t40: v1i64 = AArch64ISD::NVCAST t39
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v1i64)
+    return SDValue();
+
+  SDValue ZEXT = N->getOperand(0);
+  if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
+    return SDValue();
+
+  SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
+  if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      EXTRACT_VEC_ELT.getValueType() != MVT::i32)
+    return SDValue();
+
+  if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
+    return SDValue();
+
+  SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
+  if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
+      UADDLV.getValueType() != MVT::v4i32 ||
+      UADDLV.getOperand(0).getValueType() != MVT::v8i8)
+    return SDValue();
+
+  // Generate the new sequence with AArch64ISD::NVCAST.
+  SDLoc DL(N);
+  SDValue EXTRACT_SUBVEC =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
+                  DAG.getConstant(0, DL, MVT::i64));
+  SDValue NVCAST =
+      DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
+
+  return NVCAST;
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -23436,6 +23487,8 @@
     return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
   case ISD::CTLZ:
     return performCTLZCombine(N, DAG, Subtarget);
+  case ISD::SCALAR_TO_VECTOR:
+    return performScalarToVectorCombine(N, DCI, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -178,8 +178,8 @@
   ret i32 %0
 }
 
-define dso_local <8 x i8> @bar(<8 x i8> noundef %a) local_unnamed_addr #0 {
-; CHECK-LABEL: bar:
+define dso_local <8 x i8> @uaddlv_v8i8_dup(<8 x i8> %a) {
+; CHECK-LABEL: uaddlv_v8i8_dup:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    uaddlv h0, v0.8b
 ; CHECK-NEXT:    dup v0.8h, v0.h[0]
@@ -195,3 +195,23 @@
 }
 
 declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+
+declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
+
+define <8 x i8> @uaddlv_v8i8_urshr(<8 x i8> %a) {
+; CHECK-LABEL: uaddlv_v8i8_urshr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    urshr d0, d0, #3
+; CHECK-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
+  %0 = and i32 %vaddlv.i, 65535
+  %conv = zext i32 %0 to i64
+  %vrshr_n = tail call i64 @llvm.aarch64.neon.urshl.i64(i64 %conv, i64 -3)
+  %conv1 = trunc i64 %vrshr_n to i8
+  %vecinit.i = insertelement <8 x i8> undef, i8 %conv1, i64 0
+  %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer
+  ret <8 x i8> %vecinit7.i
+}
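
Note (editorial, not part of the patch): AArch64ISD::UADDLV of a v8i8 source writes its 16-bit sum into lane 0 and zeroes the remaining lanes of the v4i32 result, so reinterpreting the low v2i32 pair as v1i64 with AArch64ISD::NVCAST already equals the zero-extended sum. The combine therefore keeps the value in the SIMD register file instead of round-tripping it through a GPR via extract_vector_elt/zero_extend/scalar_to_vector, which is what the new uaddlv_v8i8_urshr test exercises. The sketch below is an illustrative stand-alone reproducer, not taken from the patch; the function name, file name, and llc invocation are assumptions.

; reduce.ll -- try: llc -mtriple=aarch64-none-linux-gnu -o - reduce.ll
declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>)
declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)

define i64 @uaddlv_then_scalar_urshl(<8 x i8> %a) {
entry:
  ; Sum the eight bytes; the result sits in lane 0 of the UADDLV vector node.
  %sum = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
  %ext = zext i32 %sum to i64
  ; The scalar rounding shift is selected on a D register, so its i64 operand
  ; is rebuilt as a v1i64 scalar_to_vector -- the node the combine rewrites.
  %res = tail call i64 @llvm.aarch64.neon.urshl.i64(i64 %ext, i64 -3)
  ret i64 %res
}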