Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -110,11 +110,15 @@ if (!ST->hasMVEIntegerOps()) return false; - unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); - if (VecWidth != 128) + Type *ScalarTy = DataTy->getScalarType(); + + if (ScalarTy->isFloatTy() || ScalarTy->isHalfTy()) + return true; + + if (!ScalarTy->isIntegerTy()) return false; - unsigned EltWidth = DataTy->getScalarSizeInBits(); + unsigned EltWidth = ScalarTy->getIntegerBitWidth(); return EltWidth == 32 || EltWidth == 16 || EltWidth == 8; } Index: llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -20,38 +20,30 @@ } define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) { -; CHECK-LABEL: foo_sext_v4i32_v4i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #3] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: 
foo_sext_v4i32_v4i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vstrwt.32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_sext_v4i32_v4i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q0 +; CHECK-BE-NEXT: vmovlb.s16 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrwt.32 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -62,37 +54,28 @@ } define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) { -; CHECK-LABEL: foo_sext_v4i32_v4i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r1, [r2, #6] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_sext_v4i32_v4i16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; 
CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vstrwt.32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_sext_v4i32_v4i16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vmovlb.s16 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrwt.32 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -103,38 +86,30 @@ } define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) { -; CHECK-LABEL: foo_zext_v4i32_v4i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vmov.i32 q1, #0xff -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #3] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_zext_v4i32_v4i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] +; CHECK-LE-NEXT: vmov.i32 q0, #0xff +; CHECK-LE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vldrbt.u8 q1, [r2] +; 
CHECK-LE-NEXT: vand q0, q1, q0 +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vstrwt.32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_zext_v4i32_v4i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-BE-NEXT: vmov.i32 q1, #0xff +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vand q0, q0, q1 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrwt.32 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -145,37 +120,28 @@ } define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) { -; CHECK-LABEL: foo_zext_v4i32_v4i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r1, [r2, #6] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_zext_v4i32_v4i16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-LE-NEXT: vmovlb.u16 q0, q0 +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vstrwt.32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; 
CHECK-BE-LABEL: foo_zext_v4i32_v4i16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vmovlb.u16 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrwt.32 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -203,53 +169,28 @@ } define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) { -; CHECK-LABEL: foo_sext_v8i16_v8i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.16 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.16 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.16 q0[2], r3 -; CHECK-NEXT: lsls r3, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #3] -; CHECK-NEXT: vmovmi.16 q0[3], r3 -; CHECK-NEXT: lsls r3, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.16 q0[4], r3 -; CHECK-NEXT: lsls r3, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #5] -; CHECK-NEXT: vmovmi.16 q0[5], r3 -; CHECK-NEXT: lsls r3, r1, #25 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #6] -; CHECK-NEXT: vmovmi.16 q0[6], r3 -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #7] -; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr +; 
CHECK-LE-LABEL: foo_sext_v8i16_v8i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrh.u16 q0, [r1] +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vstrht.16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_sext_v8i16_v8i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-BE-NEXT: vrev16.8 q0, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrht.16 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -260,53 +201,28 @@ } define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) { -; CHECK-LABEL: foo_zext_v8i16_v8i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.16 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.16 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.16 q0[2], r3 -; CHECK-NEXT: lsls r3, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #3] -; CHECK-NEXT: vmovmi.16 q0[3], r3 -; CHECK-NEXT: lsls r3, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.16 q0[4], r3 -; CHECK-NEXT: lsls r3, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #5] -; CHECK-NEXT: vmovmi.16 q0[5], r3 -; CHECK-NEXT: lsls r3, r1, #25 -; CHECK-NEXT: itt mi -; 
CHECK-NEXT: ldrbmi r3, [r2, #6] -; CHECK-NEXT: vmovmi.16 q0[6], r3 -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #7] -; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_zext_v8i16_v8i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrh.u16 q0, [r1] +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-LE-NEXT: vmovlb.u8 q0, q0 +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vstrht.16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_zext_v8i16_v8i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrbt.u8 q0, [r2] +; CHECK-BE-NEXT: vrev16.8 q0, q0 +; CHECK-BE-NEXT: vmovlb.u8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrht.16 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -334,51 +250,25 @@ } define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) { -; CHECK-LABEL: foo_trunc_v8i8_v8i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q0, [r2] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r2, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne.u16 r2, q0[0] -; CHECK-NEXT: strbne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[1] -; CHECK-NEXT: strbmi r2, [r0, #1] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[2] -; CHECK-NEXT: strbmi r2, [r0, #2] -; CHECK-NEXT: lsls r2, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 
r2, q0[3] -; CHECK-NEXT: strbmi r2, [r0, #3] -; CHECK-NEXT: lsls r2, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[4] -; CHECK-NEXT: strbmi r2, [r0, #4] -; CHECK-NEXT: lsls r2, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[5] -; CHECK-NEXT: strbmi r2, [r0, #5] -; CHECK-NEXT: lsls r2, r1, #25 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[6] -; CHECK-NEXT: strbmi r2, [r0, #6] -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r1, q0[7] -; CHECK-NEXT: strbmi r1, [r0, #7] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v8i8_v8i16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrh.u16 q0, [r1] +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vpstt +; CHECK-LE-NEXT: vldrht.u16 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v8i8_v8i16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrht.u16 q0, [r2] +; CHECK-BE-NEXT: vrev16.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -389,35 +279,25 @@ } define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { -; CHECK-LABEL: foo_trunc_v4i8_v4i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r2] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r2, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r2, s0 -; CHECK-NEXT: strbne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s1 -; CHECK-NEXT: strbmi r2, [r0, #1] -; CHECK-NEXT: lsls r2, r1, #29 -; 
CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s2 -; CHECK-NEXT: strbmi r2, [r0, #2] -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r1, s3 -; CHECK-NEXT: strbmi r1, [r0, #3] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v4i8_v4i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vpstt +; CHECK-LE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v4i8_v4i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -428,35 +308,25 @@ } define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { -; CHECK-LABEL: foo_trunc_v4i16_v4i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r2] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r2, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r2, s0 -; CHECK-NEXT: strhne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s1 -; CHECK-NEXT: strhmi r2, [r0, #2] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s2 -; CHECK-NEXT: strhmi r2, [r0, #4] -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r1, s3 -; CHECK-NEXT: strhmi r1, [r0, #6] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v4i16_v4i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 
q0, [r1] +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vpstt +; CHECK-LE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v4i16_v4i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer