Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5753,7 +5753,7 @@
             (v4i16 (AArch64uaddv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
             (i64 0))), (i64 0))),
           (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-            (UADDLVv4i16v V64:$op), ssub), ssub)>;
+            (UADDLVv8i8v V64:$op), hsub), ssub)>;
 def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (AArch64uaddlp
            (v16i8 V128:$op))))), (i64 0))),
           (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
Index: llvm/test/CodeGen/AArch64/neon-uaddlv.ll
===================================================================
--- llvm/test/CodeGen/AArch64/neon-uaddlv.ll
+++ llvm/test/CodeGen/AArch64/neon-uaddlv.ll
@@ -13,11 +13,13 @@
 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) nounwind readnone
 declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) nounwind readnone
 
+declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
+
 define i16 @uaddlv4h_from_v8i8(<8 x i8>* %A) nounwind {
 ; CHECK-LABEL: uaddlv4h_from_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    uaddlv s0, v0.4h
+; CHECK-NEXT:    uaddlv h0, v0.8b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, <8 x i8>* %A
@@ -77,3 +79,19 @@
   %tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
   ret i32 %tmp5
 }
+
+define i16 @count([4 x i16]* %0) {
+; CHECK-LABEL: count:
+; CHECK:       // %bb.0: // %Entry
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+Entry:
+  %1 = bitcast [4 x i16]* %0 to <4 x i16>*
+  %2 = load <4 x i16>, <4 x i16>* %1, align 2
+  %3 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %2)
+  %4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %3)
+  ret i16 %4
+}