diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1216,6 +1216,14 @@
     setOperationAction(ISD::BITCAST, MVT::i8, Custom);
     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
 
+    setTruncStoreAction(MVT::v16i8, MVT::v16i1, Custom);
+    setTruncStoreAction(MVT::v8i16, MVT::v8i1, Custom);
+    setTruncStoreAction(MVT::v4i32, MVT::v4i1, Custom);
+    setTruncStoreAction(MVT::v2i64, MVT::v2i1, Custom);
+    setTruncStoreAction(MVT::v8i8, MVT::v8i1, Custom);
+    setTruncStoreAction(MVT::v4i16, MVT::v4i1, Custom);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i1, Custom);
+
     setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
@@ -19607,20 +19615,21 @@
 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   SDValue ComparisonResult(N, 0);
-  EVT BoolVecVT = ComparisonResult.getValueType();
-  assert(BoolVecVT.isVector() && "Must be a vector type");
+  EVT VecVT = ComparisonResult.getValueType();
+  assert(VecVT.isVector() && "Must be a vector type");
 
-  unsigned NumElts = BoolVecVT.getVectorNumElements();
+  unsigned NumElts = VecVT.getVectorNumElements();
   if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
     return SDValue();
 
   // If we can find the original types to work on instead of a vector of i1,
   // we can avoid extend/extract conversion instructions.
-  EVT VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
-  if (!VecVT.isSimple()) {
-    unsigned BitsPerElement = std::max(64 / NumElts, 8u); // min. 64-bit vector
-    VecVT =
-        BoolVecVT.changeVectorElementType(MVT::getIntegerVT(BitsPerElement));
+  if (VecVT.getVectorElementType() == MVT::i1) {
+    VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
+    if (!VecVT.isSimple()) {
+      unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
+      VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
+    }
   }
   VecVT = VecVT.changeVectorElementTypeToInteger();
 
@@ -19681,6 +19690,37 @@
   return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
 }
 
+static SDValue combineVectorCompareAndTruncateStore(SelectionDAG &DAG,
+                                                    StoreSDNode *Store) {
+  if (!Store->isTruncatingStore())
+    return SDValue();
+
+  SDLoc DL(Store);
+  SDValue VecOp = Store->getValue();
+  EVT VT = VecOp.getValueType();
+  EVT MemVT = Store->getMemoryVT();
+
+  if (!MemVT.isVector() || !VT.isVector() ||
+      MemVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  // If we are storing a vector that we are currently building, let
+  // `scalarizeVectorStore()` handle this more efficiently.
+  if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
+    return SDValue();
+
+  VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
+  SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
+  if (!VectorBits)
+    return SDValue();
+
+  EVT StoreVT =
+      EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
+  SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
+  return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
+                      Store->getMemOperand());
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -19719,6 +19759,9 @@
   if (SDValue Store = foldTruncStoreOfExt(DAG, N))
     return Store;
 
+  if (SDValue Store = combineVectorCompareAndTruncateStore(DAG, ST))
+    return Store;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
--- a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
+++ b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
@@ -1,9 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
 
 define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, ptr %addr) {
 ; CHECK-LABEL: test_mismatched_setcc:
-; CHECK: cmeq [[CMP128:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK: xtn {{v[0-9]+}}.4h, [[CMP128]].4s
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4s, #63, msl #16
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
 
   %tst = icmp eq <4 x i22> %l, %r
   store <4 x i1> %tst, ptr %addr
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -418,12 +418,33 @@
   ret i4 %bitmask
 }
 
-; TODO(lawben): Change this in follow-up patch to #D145301, as truncating stores fix this.
-; Larger vector types don't map directly.
-define i8 @no_convert_large_vector(<8 x i32> %vec) {
+; Larger vector types don't map directly, but they can be split/truncated and then converted.
+; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
+define i8 @convert_large_vector(<8 x i32> %vec) { +; CHECK-LABEL: lCPI15_0: +; CHECK-NEXT: .short 1 +; CHECK-NEXT: .short 2 +; CHECK-NEXT: .short 4 +; CHECK-NEXT: .short 8 +; CHECK-NEXT: .short 16 +; CHECK-NEXT: .short 32 +; CHECK-NEXT: .short 64 +; CHECK-NEXT: .short 128 + ; CHECK-LABEL: convert_large_vector: -; CHECK: cmeq.4s v1, v1, #0 -; CHECK-NOT: addv +; CHECK: Lloh30: +; CHECK-NEXT: adrp x8, lCPI15_0@PAGE +; CHECK-NEXT: cmeq.4s v1, v1, #0 +; CHECK-NEXT: cmeq.4s v0, v0, #0 +; CHECK-NEXT: uzp1.8h v0, v0, v1 +; CHECK-NEXT: Lloh31: +; CHECK-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] +; CHECK-NEXT: bic.16b v0, v1, v0 +; CHECK-NEXT: addv.8h h0, v0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer %bitmask = bitcast <8 x i1> %cmp_result to i8 diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll @@ -0,0 +1,281 @@ +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s + +define void @store_16_elements(<16 x i8> %vec, ptr %out) { +; Bits used in mask +; CHECK-LABEL: lCPI0_0 +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 16 +; CHECK-NEXT: .byte 32 +; CHECK-NEXT: .byte 64 +; CHECK-NEXT: .byte 128 +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 16 +; CHECK-NEXT: .byte 32 +; CHECK-NEXT: .byte 64 +; CHECK-NEXT: .byte 128 + +; Actual conversion +; CHECK-LABEL: store_16_elements +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-NEXT: cmeq.16b v0, v0, #0 +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] +; CHECK-NEXT: bic.16b v0, v1, v0 +; CHECK-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-NEXT: addv.8b b0, v0 +; CHECK-NEXT: addv.8b b1, v1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: orr w8, w9, w8, lsl #8 +; CHECK-NEXT: strh w8, [x0] +; CHECK-NEXT: ret + + %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer + store <16 x i1> %cmp_result, ptr %out + ret void +} + +define void @store_8_elements(<8 x i16> %vec, ptr %out) { +; CHECK-LABEL: lCPI1_0: +; CHECK-NEXT: .short 1 +; CHECK-NEXT: .short 2 +; CHECK-NEXT: .short 4 +; CHECK-NEXT: .short 8 +; CHECK-NEXT: .short 16 +; CHECK-NEXT: .short 32 +; CHECK-NEXT: .short 64 +; CHECK-NEXT: .short 128 + +; CHECK-LABEL: store_8_elements +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x8, lCPI1_0@PAGE +; CHECK-NEXT: cmeq.8h v0, v0, #0 +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] +; CHECK-NEXT: bic.16b v0, v1, v0 +; CHECK-NEXT: addv.8h h0, v0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret + + %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer + store <8 x i1> %cmp_result, ptr %out + ret void +} + +define void @store_4_elements(<4 x i32> %vec, ptr %out) { +; CHECK-LABEL: lCPI2_0: +; CHECK-NEXT: .long 1 +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .long 8 + +; CHECK-LABEL: store_4_elements +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: adrp x8, lCPI2_0@PAGE +; CHECK-NEXT: cmeq.4s v0, v0, #0 +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] +; CHECK-NEXT: bic.16b v0, v1, v0 +; CHECK-NEXT: addv.4s s0, v0 +; CHECK-NEXT: fmov 
w8, s0 +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret + + %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer + store <4 x i1> %cmp_result, ptr %out + ret void +} + +define void @store_2_elements(<2 x i64> %vec, ptr %out) { +; CHECK-LABEL: lCPI3_0: +; CHECK-NEXT: .quad 1 +; CHECK-NEXT: .quad 2 + +; CHECK-LABEL: store_2_elements +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: adrp x8, lCPI3_0@PAGE +; CHECK-NEXT: cmeq.2d v0, v0, #0 +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr q1, [x8, lCPI3_0@PAGEOFF] +; CHECK-NEXT: bic.16b v0, v1, v0 +; CHECK-NEXT: addp.2d d0, v0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret + + %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer + store <2 x i1> %cmp_result, ptr %out + ret void +} + +define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) { +; CHECK-LABEL: lCPI4_0: +; CHECK-NEXT: .long 1 +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .long 8 + +; CHECK-LABEL: add_trunc_compare_before_store +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: adrp x8, lCPI4_0@PAGE +; CHECK-NEXT: shl.4s v0, v0, #31 +; CHECK-NEXT: cmlt.4s v0, v0, #0 +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: ldr q1, [x8, lCPI4_0@PAGEOFF] +; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: addv.4s s0, v0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret + + %trunc = trunc <4 x i32> %vec to <4 x i1> + store <4 x i1> %trunc, ptr %out + ret void +} + +define void @add_trunc_mask_unknown_vector_type(<4 x i1> %vec, ptr %out) { +; CHECK-LABEL: lCPI5_0: +; CHECK: .short 1 +; CHECK: .short 2 +; CHECK: .short 4 +; CHECK: .short 8 + +; CHECK-LABEL: add_trunc_mask_unknown_vector_type +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh10: +; CHECK-NEXT: adrp x8, lCPI5_0@PAGE +; CHECK-NEXT: shl.4h v0, v0, #15 +; CHECK-NEXT: cmlt.4h v0, v0, #0 +; CHECK-NEXT: Lloh11: +; CHECK-NEXT: ldr d1, [x8, lCPI5_0@PAGEOFF] +; CHECK-NEXT: and.8b v0, v0, v1 +; CHECK-NEXT: addv.4h h0, v0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret + + store <4 x i1> %vec, ptr %out + ret void +} + +define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) { +; CHECK-LABEL: lCPI6_0: +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 16 +; CHECK-NEXT: .byte 32 +; CHECK-NEXT: .byte 64 +; CHECK-NEXT: .byte 128 + +; CHECK-LABEL: store_8_elements_64_bit_vector +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh12: +; CHECK-NEXT: adrp x8, lCPI6_0@PAGE +; CHECK-NEXT: cmeq.8b v0, v0, #0 +; CHECK-NEXT: Lloh13: +; CHECK-NEXT: ldr d1, [x8, lCPI6_0@PAGEOFF] +; CHECK-NEXT: bic.8b v0, v1, v0 +; CHECK-NEXT: addv.8b b0, v0 +; CHECK-NEXT: st1.b { v0 }[0], [x0] +; CHECK-NEXT: ret + + %cmp_result = icmp ne <8 x i8> %vec, zeroinitializer + store <8 x i1> %cmp_result, ptr %out + ret void +} + +define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) { +; CHECK-LABEL: lCPI7_0: +; CHECK-NEXT: .short 1 +; CHECK-NEXT: .short 2 +; CHECK-NEXT: .short 4 +; CHECK-NEXT: .short 8 + +; CHECK-LABEL: store_4_elements_64_bit_vector +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh14: +; CHECK-NEXT: adrp x8, lCPI7_0@PAGE +; CHECK-NEXT: cmeq.4h v0, v0, #0 +; CHECK-NEXT: Lloh15: +; CHECK-NEXT: ldr d1, [x8, lCPI7_0@PAGEOFF] +; CHECK-NEXT: bic.8b v0, v1, v0 +; CHECK-NEXT: addv.4h h0, v0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret + + %cmp_result = icmp ne <4 x i16> %vec, zeroinitializer + store <4 x i1> %cmp_result, ptr %out + ret void +} + +define void @store_2_elements_64_bit_vector(<2 
x i32> %vec, ptr %out) { +; CHECK-LABEL: lCPI8_0: +; CHECK-NEXT: .long 1 +; CHECK-NEXT: .long 2 + +; CHECK-LABEL: store_2_elements_64_bit_vector +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh16: +; CHECK-NEXT: adrp x8, lCPI8_0@PAGE +; CHECK-NEXT: cmeq.2s v0, v0, #0 +; CHECK-NEXT: Lloh17: +; CHECK-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] +; CHECK-NEXT: bic.8b v0, v1, v0 +; CHECK-NEXT: addp.2s v0, v0, v0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret + + %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer + store <2 x i1> %cmp_result, ptr %out + ret void +} + +define void @no_combine_without_truncate(<16 x i8> %vec, ptr %out) { +; CHECK-LABEL: no_combine_without_truncate +; CHECK: cmtst.16b v0, v0, v0 +; CHECK-NOT: addv.8b b0, v0 + + %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer + %extended_result = sext <16 x i1> %cmp_result to <16 x i8> + store <16 x i8> %extended_result, ptr %out + ret void +} + +define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) { +; CHECK-LABEL: no_combine_for_non_bool_truncate +; CHECK: xtn.4h v0, v0 +; CHECK-NOT: addv.4s s0, v0 + + %trunc = trunc <4 x i32> %vec to <4 x i8> + store <4 x i8> %trunc, ptr %out + ret void +} + +define void @no_combine_for_build_vector(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) { +; CHECK-LABEL: no_combine_for_build_vector +; CHECK-NOT: addv + + %1 = insertelement <4 x i1> undef, i1 %a, i64 0 + %2 = insertelement <4 x i1> %1, i1 %b, i64 1 + %3 = insertelement <4 x i1> %2, i1 %c, i64 2 + %vec = insertelement <4 x i1> %3, i1 %d, i64 3 + store <4 x i1> %vec, ptr %out + ret void +} diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function uaddo_v4i1 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>) @@ -246,22 +246,20 @@ ; CHECK-LABEL: uaddo_v4i1: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.4h, #1 +; CHECK-NEXT: adrp x8, .LCPI10_0 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI10_0] ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: and v1.8b, v0.8b, v2.8b -; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h -; CHECK-NEXT: and w8, w8, #0x1 -; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: shl v1.4h, v0.4h, #15 +; CHECK-NEXT: and v2.8b, v0.8b, v2.8b +; CHECK-NEXT: cmeq v0.4h, v2.4h, v0.4h +; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 ; CHECK-NEXT: mvn v0.8b, v0.8b -; CHECK-NEXT: bfi w8, w10, #2, #1 -; CHECK-NEXT: orr w8, w8, w11, lsl #3 -; CHECK-NEXT: and w8, w8, #0xf +; CHECK-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-NEXT: addv h1, v1.4h ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strb w8, [x0] ; CHECK-NEXT: ret %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll --- a/llvm/test/CodeGen/AArch64/vec_umulo.ll +++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function umulo_v4i1 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>) @@ -296,18 +296,15 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { ; CHECK-LABEL: umulo_v4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d2, d0 +; CHECK-NEXT: adrp x8, .LCPI10_0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: addv h1, v0.4h ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: and v1.8b, v2.8b, v1.8b -; CHECK-NEXT: umov w8, v1.h[0] -; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: umov w10, v1.h[2] -; CHECK-NEXT: umov w11, v1.h[3] -; CHECK-NEXT: and w8, w8, #0x1 -; CHECK-NEXT: bfi w8, w9, #1, #1 -; CHECK-NEXT: bfi w8, w10, #2, #1 -; CHECK-NEXT: orr w8, w8, w11, lsl #3 -; CHECK-NEXT: and w8, w8, #0xf +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strb w8, [x0] ; CHECK-NEXT: ret %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
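
The tests above all lower the <N x i1> store to the same shape: a compare, an AND/BIC against a constant-pool vector holding one bit position per lane, a horizontal add (addv/addp), and a single scalar store. As a sanity check of that trick, here is a scalar C++ model of the store_8_elements sequence. It is illustrative only and not part of the patch; the function name and sample input are invented, and it assumes the usual packing in which element i of the <8 x i1> value becomes bit i of the stored byte.

// Scalar model of the NEON sequence checked in store_8_elements (illustrative only).
#include <cstdint>
#include <cstdio>

static uint8_t bitmaskOfNonZeroLanes(const uint16_t Vec[8]) {
  // Constant pool lCPI1_0: one distinct bit per lane.
  const uint16_t LaneBits[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  uint16_t Sum = 0;
  for (int Lane = 0; Lane < 8; ++Lane) {
    // cmeq.8h v0, v0, #0: all-ones for zero lanes, all-zeros otherwise.
    uint16_t CmpEqZero = (Vec[Lane] == 0) ? 0xFFFF : 0x0000;
    // bic.16b v0, v1, v0 keeps LaneBits only where the lane was non-zero;
    // addv.8h h0, v0 then sums the surviving bits across all lanes.
    Sum += LaneBits[Lane] & (uint16_t)~CmpEqZero;
  }
  // fmov w8, s0 + strb w8, [x0]: the low byte is the packed <8 x i1> bitmask.
  return (uint8_t)Sum;
}

int main() {
  const uint16_t Vec[8] = {7, 0, 1, 0, 0, 0, 9, 3};
  // Lanes 0, 2, 6 and 7 are non-zero, so the expected mask is 1 + 4 + 64 + 128 = 0xc5.
  printf("bitmask = 0x%02x\n", (unsigned)bitmaskOfNonZeroLanes(Vec));
  return 0;
}

Because every lane contributes a distinct power of two, the per-lane terms never overlap, so the horizontal add is equivalent to OR-ing the selected bits together; that is what lets a single addv plus a narrow store replace the old extract/bfi chain visible in the removed CHECK lines of vec_uaddo.ll and vec_umulo.ll.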