diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5118,17 +5118,31 @@
   return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4;
 }]>;
-def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
-                                   (masked_st node:$val, node:$ptr, node:$pred), [{
-  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
+def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                           (masked_st node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
 }]>;
-def alignedmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
-                                   (masked_st node:$val, node:$ptr, node:$pred), [{
-  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
+def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                     (maskedstore8 node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                            (masked_st node:$val, node:$ptr, node:$pred), [{
+  auto *St = cast<MaskedStoreSDNode>(N);
+  EVT ScalarVT = St->getMemoryVT().getScalarType();
+  return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
 }]>;
-def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred),
-                          (masked_st node:$val, node:$ptr, node:$pred)>;
+def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                      (maskedstore16 node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                            (masked_st node:$val, node:$ptr, node:$pred), [{
+  auto *St = cast<MaskedStoreSDNode>(N);
+  EVT ScalarVT = St->getMemoryVT().getScalarType();
+  return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
+}]>;
 
 let Predicates = [HasMVEInt, IsLE] in {
   // Stores
@@ -5148,16 +5162,6 @@
   defm : MVE_vector_offset_store;
   defm : MVE_vector_offset_store;
   defm : MVE_vector_offset_store;
-
-  // Unaligned masked stores (aligned are below)
-  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
-            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
-            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
-            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
-            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
 }
 
 let Predicates = [HasMVEInt, IsBE] in {
@@ -5212,25 +5216,22 @@
   def : MVE_vector_offset_store_typed;
   def : MVE_vector_offset_store_typed;
   def : MVE_vector_offset_store_typed;
-
-  // Unaligned masked stores (aligned are below)
-  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
-            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
-            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
-            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
-            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
 }
 
 let Predicates = [HasMVEInt] in {
   // Aligned masked store, shared between LE and BE
-  def : MVE_vector_maskedstore_typed;
-  def : MVE_vector_maskedstore_typed;
-  def : MVE_vector_maskedstore_typed;
-  def : MVE_vector_maskedstore_typed;
-  def : MVE_vector_maskedstore_typed;
+  def : MVE_vector_maskedstore_typed;
+  def : MVE_vector_maskedstore_typed;
+  def : MVE_vector_maskedstore_typed;
+  def : MVE_vector_maskedstore_typed;
+  def : MVE_vector_maskedstore_typed;
+  // Truncating stores
+  def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred),
+            (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>;
   // Aligned masked loads
   def : MVE_vector_maskedload_typed;
   def : MVE_vector_maskedload_typed;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -154,6 +154,7 @@
   }
 
   bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment);
+  bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) { return isLegalMaskedLoad(DataTy, Alignment); }
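
With the PatFrags above, a masked store whose value operand is truncated from a wider vector element type can now be selected directly to the narrowing MVE stores (MVE_VSTRB16, MVE_VSTRB32, MVE_VSTRH32), and the new isLegalMaskedStore hook reports such stores as legal under the same conditions as masked loads. A minimal IR sketch of the kind of input these patterns target (hypothetical function name; it mirrors the masked_v8i8 and foo_trunc_v8i8_v8i16 tests below, which carry the actual RUN lines and CHECK patterns):

define arm_aapcs_vfpcc void @trunc_store_sketch(<8 x i8> *%dest, <8 x i16> %a) {
entry:
  ; Lanes selected by the compare are truncated from i16 to i8 and stored
  ; under the mask; this is the shape truncatingmaskedstore8 is meant to
  ; catch (the tests below expect a single predicated vstrbt.16).
  %c = icmp sgt <8 x i16> %a, zeroinitializer
  %trunc = trunc <8 x i16> %a to <8 x i8>
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %trunc, <8 x i8>* %dest, i32 1, <8 x i1> %c)
  ret void
}
declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
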
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -783,23 +783,13 @@
 }
 
 define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
-; CHECK-LE-LABEL: foo_trunc_v8i8_v8i16:
-; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vldrh.u16 q0, [r1]
-; CHECK-LE-NEXT: vptt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vldrht.u16 q0, [r2]
-; CHECK-LE-NEXT: vstrbt.8 q0, [r0]
-; CHECK-LE-NEXT: bx lr
-;
-; CHECK-BE-LABEL: foo_trunc_v8i8_v8i16:
-; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-BE-NEXT: vldrht.u16 q0, [r2]
-; CHECK-BE-NEXT: vrev16.8 q0, q0
-; CHECK-BE-NEXT: vpst
-; CHECK-BE-NEXT: vstrbt.8 q0, [r0]
-; CHECK-BE-NEXT: bx lr
+; CHECK-LABEL: foo_trunc_v8i8_v8i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vptt.s16 gt, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r2]
+; CHECK-NEXT: vstrbt.16 q0, [r0]
+; CHECK-NEXT: bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
   %1 = icmp sgt <8 x i16> %0, zeroinitializer
@@ -810,23 +800,13 @@
 }
 
 define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
-; CHECK-LE-LABEL: foo_trunc_v4i8_v4i32:
-; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT: vptt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vldrwt.u32 q0, [r2]
-; CHECK-LE-NEXT: vstrbt.8 q0, [r0]
-; CHECK-LE-NEXT: bx lr
-;
-; CHECK-BE-LABEL: foo_trunc_v4i8_v4i32:
-; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-BE-NEXT: vldrwt.u32 q0, [r2]
-; CHECK-BE-NEXT: vrev32.8 q0, q0
-; CHECK-BE-NEXT: vpst
-; CHECK-BE-NEXT: vstrbt.8 q0, [r0]
-; CHECK-BE-NEXT: bx lr
+; CHECK-LABEL: foo_trunc_v4i8_v4i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vptt.s32 gt, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r2]
+; CHECK-NEXT: vstrbt.32 q0, [r0]
+; CHECK-NEXT: bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
@@ -837,23 +817,13 @@
 }
 
 define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
-; CHECK-LE-LABEL: foo_trunc_v4i16_v4i32:
-; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT: vptt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vldrwt.u32 q0, [r2]
-; CHECK-LE-NEXT: vstrbt.8 q0, [r0]
-; CHECK-LE-NEXT: bx lr
-;
-; CHECK-BE-LABEL: foo_trunc_v4i16_v4i32:
-; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-BE-NEXT: vldrwt.u32 q0, [r2]
-; CHECK-BE-NEXT: vrev32.8 q0, q0
-; CHECK-BE-NEXT: vpst
-; CHECK-BE-NEXT: vstrbt.8 q0, [r0]
-; CHECK-BE-NEXT: bx lr
+; CHECK-LABEL: foo_trunc_v4i16_v4i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vptt.s32 gt, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r2]
+; CHECK-NEXT: vstrht.32 q0, [r0]
+; CHECK-NEXT: bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
--- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
@@ -1102,11 +1102,738 @@
   ret void
 }
 
+define arm_aapcs_vfpcc void @masked_v4i16(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: masked_v4i16:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vstrht.32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: masked_v4i16:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vstrht.32 q1, [r0]
+; CHECK-BE-NEXT: bx lr
+entry:
+  %c = icmp sgt <4 x i32> %a, zeroinitializer
+  %trunc = trunc <4 x i32> %a to <4 x i16>
+  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %trunc, <4 x i16>* %dest, i32 2, <4 x i1> %c)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @masked_v4i8(<4 x i8> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: masked_v4i8:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vstrbt.32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: masked_v4i8:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vstrbt.32 q1, [r0]
+; CHECK-BE-NEXT: bx lr
+entry:
+  %c = icmp sgt <4 x i32> %a, zeroinitializer
+  %trunc = trunc <4 x i32> %a to <4 x i8>
+  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %trunc, <4 x i8>* %dest, i32 1, <4 x i1> %c)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @masked_v8i8(<8 x i8> *%dest, <8 x i16> %a) {
+; CHECK-LE-LABEL: masked_v8i8:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vstrbt.16 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: masked_v8i8:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
+; CHECK-BE-NEXT: vstrbt.16 q1, [r0]
+; CHECK-BE-NEXT: bx lr
+entry:
+  %c = icmp sgt <8 x i16> %a, zeroinitializer
+  %trunc = trunc <8 x i16> %a to <8 x i8>
+  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %trunc, <8 x i8>* %dest, i32 1, <8 x i1> %c)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @masked_v4i16_align1(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i16_align1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp,
#4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r1, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r1, #0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne r2, s0 +; CHECK-LE-NEXT: strhne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s1 +; CHECK-LE-NEXT: strhmi r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s2 +; CHECK-LE-NEXT: strhmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, s3 +; CHECK-LE-NEXT: strhmi r1, [r0, #6] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i16_align1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r1, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r1, #0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne r2, s4 +; CHECK-BE-NEXT: strhne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s5 +; CHECK-BE-NEXT: strhmi r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s6 +; CHECK-BE-NEXT: strhmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, s7 +; CHECK-BE-NEXT: strhmi r1, [r0, #6] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %trunc = trunc <4 x i32> %a to <4 x i16> + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %trunc, <4 x i16>* %dest, i32 1, <4 x i1> %c) + ret void +} + +define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float> %a) { +; CHECK-LE-LABEL: masked_v4f16_align4: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcmp.f32 s0, #0 +; CHECK-LE-NEXT: vmov r1, s4 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s1 +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q1[0], r1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s2 +; CHECK-LE-NEXT: vmov.16 q1[1], r2 +; CHECK-LE-NEXT: vmov r1, s8 +; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s3 +; CHECK-LE-NEXT: vmov.16 q1[2], r1 +; CHECK-LE-NEXT: vmov r1, s8 +; CHECK-LE-NEXT: vmov.16 q1[3], r1 +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: cset r1, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: and r1, r1, #1 +; 
CHECK-LE-NEXT: vcmp.f32 s2, #0 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: mov.w r3, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: cset r3, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: and r3, r3, #1 +; CHECK-LE-NEXT: vcmp.f32 s3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: mov.w r2, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: mov.w r3, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: cset r3, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r2, #1 +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: and r3, r3, #1 +; CHECK-LE-NEXT: cset r2, ne +; CHECK-LE-NEXT: and r2, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB25_5 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB25_6 +; CHECK-LE-NEXT: .LBB25_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB25_7 +; CHECK-LE-NEXT: .LBB25_3: @ %else4 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: bmi .LBB25_8 +; CHECK-LE-NEXT: .LBB25_4: @ %else6 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB25_5: @ %cond.store +; CHECK-LE-NEXT: vstr.16 s4, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB25_2 +; CHECK-LE-NEXT: .LBB25_6: @ %cond.store1 +; CHECK-LE-NEXT: vmovx.f16 s0, s4 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB25_3 +; CHECK-LE-NEXT: .LBB25_7: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s5, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: bpl .LBB25_4 +; CHECK-LE-NEXT: .LBB25_8: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s0, s5 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #6] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f16_align4: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 +; CHECK-BE-NEXT: vcmp.f32 s4, #0 +; CHECK-BE-NEXT: vmov r1, s0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s5 +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q0[0], r1 +; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s6 +; CHECK-BE-NEXT: vmov.16 q0[1], r2 +; CHECK-BE-NEXT: vmov r1, s8 +; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s7 +; CHECK-BE-NEXT: vmov.16 q0[2], r1 +; CHECK-BE-NEXT: vmov r1, s8 +; CHECK-BE-NEXT: vmov.16 q0[3], r1 +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: vcmp.f32 s5, #0 +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r1, #1 +; CHECK-BE-NEXT: cmp r1, #0 +; CHECK-BE-NEXT: cset r1, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: rsb.w r3, r1, #0 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: cset r3, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: and r3, r3, #1 +; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: rsb.w r3, r3, #0 +; CHECK-BE-NEXT: mov.w r2, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: 
it gt +; CHECK-BE-NEXT: movgt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: cset r3, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r2, #1 +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: and r3, r3, #1 +; CHECK-BE-NEXT: cset r2, ne +; CHECK-BE-NEXT: and r2, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB25_5 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB25_6 +; CHECK-BE-NEXT: .LBB25_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB25_7 +; CHECK-BE-NEXT: .LBB25_3: @ %else4 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: bmi .LBB25_8 +; CHECK-BE-NEXT: .LBB25_4: @ %else6 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB25_5: @ %cond.store +; CHECK-BE-NEXT: vstr.16 s0, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB25_2 +; CHECK-BE-NEXT: .LBB25_6: @ %cond.store1 +; CHECK-BE-NEXT: vmovx.f16 s4, s0 +; CHECK-BE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB25_3 +; CHECK-BE-NEXT: .LBB25_7: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: bpl .LBB25_4 +; CHECK-BE-NEXT: .LBB25_8: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s0, s1 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #6] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = fcmp ogt <4 x float> %a, zeroinitializer + %trunc = fptrunc <4 x float> %a to <4 x half> + call void @llvm.masked.store.v4f16.p0v4f16(<4 x half> %trunc, <4 x half>* %dest, i32 4, <4 x i1> %c) + ret void +} + +define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float> %a) { +; CHECK-LE-LABEL: masked_v4f16_align2: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcmp.f32 s0, #0 +; CHECK-LE-NEXT: vmov r1, s4 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s1 +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q1[0], r1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s2 +; CHECK-LE-NEXT: vmov.16 q1[1], r2 +; CHECK-LE-NEXT: vmov r1, s8 +; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s3 +; CHECK-LE-NEXT: vmov.16 q1[2], r1 +; CHECK-LE-NEXT: vmov r1, s8 +; CHECK-LE-NEXT: vmov.16 q1[3], r1 +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: cset r1, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: vcmp.f32 s2, #0 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: mov.w r3, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: cset r3, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: and r3, r3, #1 +; CHECK-LE-NEXT: vcmp.f32 s3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: mov.w r2, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: mov.w r3, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: cset r3, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r2, #1 +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: and r3, r3, #1 +; 
CHECK-LE-NEXT: cset r2, ne +; CHECK-LE-NEXT: and r2, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB26_5 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB26_6 +; CHECK-LE-NEXT: .LBB26_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB26_7 +; CHECK-LE-NEXT: .LBB26_3: @ %else4 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: bmi .LBB26_8 +; CHECK-LE-NEXT: .LBB26_4: @ %else6 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB26_5: @ %cond.store +; CHECK-LE-NEXT: vstr.16 s4, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB26_2 +; CHECK-LE-NEXT: .LBB26_6: @ %cond.store1 +; CHECK-LE-NEXT: vmovx.f16 s0, s4 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB26_3 +; CHECK-LE-NEXT: .LBB26_7: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s5, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: bpl .LBB26_4 +; CHECK-LE-NEXT: .LBB26_8: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s0, s5 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #6] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f16_align2: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 +; CHECK-BE-NEXT: vcmp.f32 s4, #0 +; CHECK-BE-NEXT: vmov r1, s0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s5 +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q0[0], r1 +; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s6 +; CHECK-BE-NEXT: vmov.16 q0[1], r2 +; CHECK-BE-NEXT: vmov r1, s8 +; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s7 +; CHECK-BE-NEXT: vmov.16 q0[2], r1 +; CHECK-BE-NEXT: vmov r1, s8 +; CHECK-BE-NEXT: vmov.16 q0[3], r1 +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: vcmp.f32 s5, #0 +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r1, #1 +; CHECK-BE-NEXT: cmp r1, #0 +; CHECK-BE-NEXT: cset r1, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: rsb.w r3, r1, #0 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: cset r3, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: and r3, r3, #1 +; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: rsb.w r3, r3, #0 +; CHECK-BE-NEXT: mov.w r2, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: cset r3, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r2, #1 +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: and r3, r3, #1 +; CHECK-BE-NEXT: cset r2, ne +; CHECK-BE-NEXT: and r2, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB26_5 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB26_6 +; CHECK-BE-NEXT: .LBB26_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB26_7 +; CHECK-BE-NEXT: .LBB26_3: @ %else4 +; CHECK-BE-NEXT: lsls r1, 
r1, #28 +; CHECK-BE-NEXT: bmi .LBB26_8 +; CHECK-BE-NEXT: .LBB26_4: @ %else6 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB26_5: @ %cond.store +; CHECK-BE-NEXT: vstr.16 s0, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB26_2 +; CHECK-BE-NEXT: .LBB26_6: @ %cond.store1 +; CHECK-BE-NEXT: vmovx.f16 s4, s0 +; CHECK-BE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB26_3 +; CHECK-BE-NEXT: .LBB26_7: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: bpl .LBB26_4 +; CHECK-BE-NEXT: .LBB26_8: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s0, s1 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #6] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = fcmp ogt <4 x float> %a, zeroinitializer + %trunc = fptrunc <4 x float> %a to <4 x half> + call void @llvm.masked.store.v4f16.p0v4f16(<4 x half> %trunc, <4 x half>* %dest, i32 2, <4 x i1> %c) + ret void +} + +define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float> %a) { +; CHECK-LE-LABEL: masked_v4f16_align1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #20 +; CHECK-LE-NEXT: sub sp, #20 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcmp.f32 s0, #0 +; CHECK-LE-NEXT: vmov r1, s4 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s1 +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q1[0], r1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s2 +; CHECK-LE-NEXT: vmov.16 q1[1], r2 +; CHECK-LE-NEXT: vmov r1, s8 +; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s3 +; CHECK-LE-NEXT: vmov.16 q1[2], r1 +; CHECK-LE-NEXT: vmov r1, s8 +; CHECK-LE-NEXT: vmov.16 q1[3], r1 +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: cset r1, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: vcmp.f32 s2, #0 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: mov.w r3, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: cset r3, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: and r3, r3, #1 +; CHECK-LE-NEXT: vcmp.f32 s3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: mov.w r2, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: mov.w r3, #0 +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: cset r3, ne +; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: it gt +; CHECK-LE-NEXT: movgt r2, #1 +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: and r3, r3, #1 +; CHECK-LE-NEXT: cset r2, ne +; CHECK-LE-NEXT: and r2, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB27_5 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB27_6 +; CHECK-LE-NEXT: .LBB27_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB27_7 +; CHECK-LE-NEXT: .LBB27_3: @ %else4 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: bmi .LBB27_8 +; CHECK-LE-NEXT: .LBB27_4: @ %else6 +; CHECK-LE-NEXT: add sp, #20 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB27_5: @ %cond.store +; CHECK-LE-NEXT: vstr.16 s4, [sp, #12] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #12] +; 
CHECK-LE-NEXT: strh r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB27_2 +; CHECK-LE-NEXT: .LBB27_6: @ %cond.store1 +; CHECK-LE-NEXT: vmovx.f16 s0, s4 +; CHECK-LE-NEXT: vstr.16 s0, [sp, #8] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #8] +; CHECK-LE-NEXT: strh r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB27_3 +; CHECK-LE-NEXT: .LBB27_7: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s5, [sp, #4] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #4] +; CHECK-LE-NEXT: strh r2, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: bpl .LBB27_4 +; CHECK-LE-NEXT: .LBB27_8: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s0, s5 +; CHECK-LE-NEXT: vstr.16 s0, [sp] +; CHECK-LE-NEXT: ldrh.w r1, [sp] +; CHECK-LE-NEXT: strh r1, [r0, #6] +; CHECK-LE-NEXT: add sp, #20 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f16_align1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #20 +; CHECK-BE-NEXT: sub sp, #20 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 +; CHECK-BE-NEXT: vcmp.f32 s4, #0 +; CHECK-BE-NEXT: vmov r1, s0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s5 +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q0[0], r1 +; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s6 +; CHECK-BE-NEXT: vmov.16 q0[1], r2 +; CHECK-BE-NEXT: vmov r1, s8 +; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s7 +; CHECK-BE-NEXT: vmov.16 q0[2], r1 +; CHECK-BE-NEXT: vmov r1, s8 +; CHECK-BE-NEXT: vmov.16 q0[3], r1 +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: vcmp.f32 s5, #0 +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r1, #1 +; CHECK-BE-NEXT: cmp r1, #0 +; CHECK-BE-NEXT: cset r1, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: rsb.w r3, r1, #0 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: cset r3, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: and r3, r3, #1 +; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: rsb.w r3, r3, #0 +; CHECK-BE-NEXT: mov.w r2, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: cset r3, ne +; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-BE-NEXT: it gt +; CHECK-BE-NEXT: movgt r2, #1 +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: and r3, r3, #1 +; CHECK-BE-NEXT: cset r2, ne +; CHECK-BE-NEXT: and r2, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB27_5 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB27_6 +; CHECK-BE-NEXT: .LBB27_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB27_7 +; CHECK-BE-NEXT: .LBB27_3: @ %else4 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: bmi .LBB27_8 +; CHECK-BE-NEXT: .LBB27_4: @ %else6 +; CHECK-BE-NEXT: add sp, #20 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB27_5: @ %cond.store +; CHECK-BE-NEXT: vstr.16 s0, [sp, #12] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #12] +; CHECK-BE-NEXT: strh r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB27_2 +; CHECK-BE-NEXT: .LBB27_6: @ %cond.store1 +; CHECK-BE-NEXT: vmovx.f16 s4, s0 +; CHECK-BE-NEXT: vstr.16 s4, [sp, #8] +; CHECK-BE-NEXT: ldrh.w r2, 
[sp, #8] +; CHECK-BE-NEXT: strh r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB27_3 +; CHECK-BE-NEXT: .LBB27_7: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s1, [sp, #4] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #4] +; CHECK-BE-NEXT: strh r2, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: bpl .LBB27_4 +; CHECK-BE-NEXT: .LBB27_8: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s0, s1 +; CHECK-BE-NEXT: vstr.16 s0, [sp] +; CHECK-BE-NEXT: ldrh.w r1, [sp] +; CHECK-BE-NEXT: strh r1, [r0, #6] +; CHECK-BE-NEXT: add sp, #20 +; CHECK-BE-NEXT: bx lr +entry: + %c = fcmp ogt <4 x float> %a, zeroinitializer + %trunc = fptrunc <4 x float> %a to <4 x half> + call void @llvm.masked.store.v4f16.p0v4f16(<4 x half> %trunc, <4 x half>* %dest, i32 1, <4 x i1> %c) + ret void +} +declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare void @llvm.masked.store.v4f16.p0v4f16(<4 x half>, <4 x half>*, i32, <4 x i1>) declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
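
A note on the alignment guards: maskedstore16 and maskedstore32 in the ARMInstrMVE.td hunk only match accesses that are at least 2- and 4-byte aligned respectively, so an under-aligned truncating masked store still takes the scalarised path exercised by the align1 tests above. A short illustrative sketch of the contrast, under the same assumptions as this test file (hypothetical function names; the v4i16 intrinsic is the one declared above):

; Natural 2-byte alignment: matched by the new patterns, giving a single
; predicated narrowing store (vstrht.32), as checked in @masked_v4i16.
define arm_aapcs_vfpcc void @trunc_store_align2(<4 x i16> *%dest, <4 x i32> %a) {
entry:
  %c = icmp sgt <4 x i32> %a, zeroinitializer
  %trunc = trunc <4 x i32> %a to <4 x i16>
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %trunc, <4 x i16>* %dest, i32 2, <4 x i1> %c)
  ret void
}

; align 1 fails the maskedstore16 alignment check and is expanded to per-lane
; conditional stores, as checked in @masked_v4i16_align1.
define arm_aapcs_vfpcc void @trunc_store_align1(<4 x i16> *%dest, <4 x i32> %a) {
entry:
  %c = icmp sgt <4 x i32> %a, zeroinitializer
  %trunc = trunc <4 x i32> %a to <4 x i16>
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %trunc, <4 x i16>* %dest, i32 1, <4 x i1> %c)
  ret void
}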