Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -13757,6 +13757,52 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
 }
 
+/// Fold a SIGN_EXTEND, ZERO_EXTEND or ANY_EXTEND of a masked load into a single
+/// extending masked load.
+///
+/// \p Ext is the extend node being combined. Returns the replacement load, or
+/// an empty SDValue when the operand is not a masked load that can be folded.
+static SDValue CombineExtendingMaskedLoad(SDNode *Ext, SelectionDAG &DAG) {
+  SDValue Src = Ext->getOperand(0);
+  auto *Ld = dyn_cast<MaskedLoadSDNode>(Src);
+  if (!Ld)
+    return SDValue();
+
+  // A load that already extends would have its extension kind overwritten, and
+  // a load with other users would stay alive next to the new one.
+  if (Ld->getExtensionType() != ISD::NON_EXTLOAD || !Src.hasOneUse())
+    return SDValue();
+
+  // Only build the forms the extending masked load patterns in ARMInstrMVE.td
+  // cover: i8 elements into v8i16/v4i32, and 2-byte aligned i16 elements into
+  // v4i32.
+  EVT VT = Ext->getValueType(0);
+  EVT MemEltVT = Ld->getMemoryVT().getScalarType();
+  bool IsByteLoad =
+      MemEltVT == MVT::i8 && (VT == MVT::v8i16 || VT == MVT::v4i32);
+  bool IsHalfLoad =
+      MemEltVT == MVT::i16 && VT == MVT::v4i32 && Ld->getAlignment() >= 2;
+  if (!IsByteLoad && !IsHalfLoad)
+    return SDValue();
+
+  // ANY_EXTEND places no requirement on the new high bits, so it shares the
+  // zero-extending load with ZERO_EXTEND.
+  ISD::LoadExtType ExtTy =
+      Ext->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+
+  // Widen the pass-through value with the same extend as the loaded lanes.
+  SDLoc dl(Ld);
+  SDValue PassThru = DAG.getNode(Ext->getOpcode(), dl, VT, Ld->getPassThru());
+  SDValue NewLoad = DAG.getMaskedLoad(
+      VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMask(), PassThru,
+      Ld->getMemoryVT(), Ld->getMemOperand(), ExtTy, Ld->isExpandingLoad());
+
+  // Move chain users over to the new load. Ext itself is replaced by the DAG
+  // combiner with the value returned here, so it needs no explicit RAUW.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
+  return NewLoad;
+}
+
 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
@@ -13794,9 +13840,12 @@
     }
   }
 
-  if (ST->hasMVEIntegerOps())
+  if (ST->hasMVEIntegerOps()) {
     if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
       return NewLoad;
+    if (SDValue NewLoad = CombineExtendingMaskedLoad(N, DAG))
+      return NewLoad;
+  }
 
   return SDValue();
 }
Index: lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- lib/Target/ARM/ARMInstrMVE.td
+++ lib/Target/ARM/ARMInstrMVE.td
@@ -5016,6 +5016,39 @@
                                    (masked_st node:$val, node:$ptr, node:$pred), [{
   return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
 }]>;
+// Sign-extending masked load of i8 elements; byte accesses need no alignment.
+def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                              (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+  auto *Ld = cast<MaskedLoadSDNode>(N);
+  return Ld->getExtensionType() == ISD::SEXTLOAD &&
+         Ld->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+// Sign-extending masked load of i16 elements that are at least 2-byte aligned.
+def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                               (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+  auto *Ld = cast<MaskedLoadSDNode>(N);
+  return Ld->getExtensionType() == ISD::SEXTLOAD &&
+         Ld->getMemoryVT().getScalarType() == MVT::i16 &&
+         Ld->getAlignment() >= 2;
+}]>;
+
+// Zero-extending masked load of i8 elements; byte accesses need no alignment.
+def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                              (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+  auto *Ld = cast<MaskedLoadSDNode>(N);
+  return Ld->getExtensionType() == ISD::ZEXTLOAD &&
+         Ld->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+// Zero-extending masked load of i16 elements that are at least 2-byte aligned.
+def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                               (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+  auto *Ld = cast<MaskedLoadSDNode>(N);
+  return Ld->getExtensionType() == ISD::ZEXTLOAD &&
+         Ld->getMemoryVT().getScalarType() == MVT::i16 &&
+         Ld->getAlignment() >= 2;
+}]>;
 def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred),
                           (masked_st node:$val, node:$ptr, node:$pred)>;
 
@@ -5145,6 +5178,25 @@
   def : MVE_vector_maskedload_typed;
   def : MVE_vector_maskedload_typed;
   def : MVE_vector_maskedload_typed;
+  // Extending masked loads.
+  def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                     (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                     (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
 }
 
 // Widening/Narrowing Loads/Stores
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -495,11 +495,15 @@
   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
     return false;
 
-  if (DataTy->isVectorTy()) {
-    // We don't yet support narrowing or widening masked loads/stores. Expand
-    // them for the moment.
+  if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
     unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
-    if (VecWidth != 128)
+
+    // We don't support extending fp types.
+    if (VecTy->getElementType()->isFloatingPointTy() && VecWidth != 128)
+      return false;
+
+    // sext/zext integers.
+ if (VecWidth > 128) return false; } Index: test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll =================================================================== --- test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -4,75 +4,39 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_char: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it eq +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: sub.w lr, r3, #4 -; CHECK-NEXT: adr r2, .LCPI0_0 +; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: add.w lr, r3, lr, lsr #2 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vmov.i32 q2, #0xff +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: adr r3, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: @ implicit-def: $q4 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r4, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r4, #0, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #1, #1 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: 
bfi r2, r4, #2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: lsls r3, r2, #31 -; CHECK-NEXT: add.w r3, r1, r0 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r4, [r3] -; CHECK-NEXT: vmovne.32 q4[0], r4 -; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r4, [r3, #1] -; CHECK-NEXT: vmovmi.32 q4[1], r4 -; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r4, [r3, #2] -; CHECK-NEXT: vmovmi.32 q4[2], r4 -; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r2, [r3, #3] -; CHECK-NEXT: vmovmi.32 q4[3], r2 -; CHECK-NEXT: vand q5, q4, q2 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmla.u32 q4, q5, r12 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: adds r3, r1, r2 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpt.u32 cs, q1, q4 +; CHECK-NEXT: vldrbt.u32 q4, [r3] +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.u32 q0, q4, r0 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q4, q3 +; CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: @@ -125,74 +89,39 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_short: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it eq +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: sub.w lr, r3, #4 -; CHECK-NEXT: 
adr r2, .LCPI1_0 +; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: add.w lr, r3, lr, lsr #2 -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: adr r3, .LCPI1_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vcmp.u32 cs, q0, q3 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r4, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r4, #0, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #1, #1 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: lsls r3, r2, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r1] -; CHECK-NEXT: vmovne.32 q3[0], r3 -; CHECK-NEXT: lsls r3, r2, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r1, #2] -; CHECK-NEXT: vmovmi.32 q3[1], r3 -; CHECK-NEXT: lsls r3, r2, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r1, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r3 -; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r2, [r1, #6] -; CHECK-NEXT: vmovmi.32 q3[3], r2 -; CHECK-NEXT: vmovlb.s16 q4, q3 -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpt.u32 cs, q1, q4 +; CHECK-NEXT: vldrht.s32 q4, [r1] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vmla.u32 q3, q4, r12 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.u32 q0, q4, r0 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel 
q0, q3, q2 +; CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI1_0: @@ -245,75 +174,39 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_uchar: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it eq +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: sub.w lr, r3, #4 -; CHECK-NEXT: adr r2, .LCPI2_0 +; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: add.w lr, r3, lr, lsr #2 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vmov.i32 q2, #0xff +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: adr r3, .LCPI2_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: @ implicit-def: $q4 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r4, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r4, #0, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #1, #1 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #2, #1 
-; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: lsls r3, r2, #31 -; CHECK-NEXT: add.w r3, r1, r0 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r4, [r3] -; CHECK-NEXT: vmovne.32 q4[0], r4 -; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r4, [r3, #1] -; CHECK-NEXT: vmovmi.32 q4[1], r4 -; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r4, [r3, #2] -; CHECK-NEXT: vmovmi.32 q4[2], r4 -; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r2, [r3, #3] -; CHECK-NEXT: vmovmi.32 q4[3], r2 -; CHECK-NEXT: vand q5, q4, q2 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmla.u32 q4, q5, r12 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: adds r3, r1, r2 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpt.u32 cs, q1, q4 +; CHECK-NEXT: vldrbt.u32 q4, [r3] +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.u32 q0, q4, r0 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q4, q3 +; CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI2_0: @@ -366,74 +259,39 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_ushort: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it eq +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: sub.w lr, r3, #4 -; CHECK-NEXT: adr r2, .LCPI3_0 
+; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: add.w lr, r3, lr, lsr #2 -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: adr r3, .LCPI3_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vcmp.u32 cs, q0, q3 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r4, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r4, #0, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #1, #1 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: lsls r3, r2, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r1] -; CHECK-NEXT: vmovne.32 q3[0], r3 -; CHECK-NEXT: lsls r3, r2, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r1, #2] -; CHECK-NEXT: vmovmi.32 q3[1], r3 -; CHECK-NEXT: lsls r3, r2, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r1, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r3 -; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r2, [r1, #6] -; CHECK-NEXT: vmovmi.32 q3[3], r2 -; CHECK-NEXT: vmovlb.u16 q4, q3 -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpt.u32 cs, q1, q4 +; CHECK-NEXT: vldrht.u32 q4, [r1] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vmla.u32 q3, q4, r12 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.u32 q0, q4, r0 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q3, q2 +; 
CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI3_0: @@ -558,134 +416,66 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_char: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #72] -; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: ldr r7, [sp, #28] +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq.w .LBB5_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r6, r3 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: add.w r4, r3, r7, lsl #2 +; CHECK-NEXT: adds r5, r1, r7 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r7 +; CHECK-NEXT: cset r12, hi +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: ands r5, r4 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: ands r6, r4 +; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq r7, r6 -; CHECK-NEXT: lslseq.w r7, r7, #31 +; CHECK-NEXT: andeq.w r6, r5, r12 +; CHECK-NEXT: lslseq.w r6, r6, #31 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and lr, r12, #3 -; CHECK-NEXT: cmp r4, #3 -; CHECK-NEXT: bhs.w .LBB5_6 +; CHECK-NEXT: subs r6, r7, #1 +; CHECK-NEXT: and lr, r7, #3 +; CHECK-NEXT: cmp r6, #3 +; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB5_9 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: add.w r7, r12, #3 -; 
CHECK-NEXT: adr r5, .LCPI5_0 -; CHECK-NEXT: bic r7, r7, #3 -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vmov.i32 q3, #0xff +; CHECK-NEXT: adds r6, r7, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: bic r6, r6, #3 +; CHECK-NEXT: subs r7, #1 +; CHECK-NEXT: subs r6, #4 +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: adr r6, .LCPI5_0 +; CHECK-NEXT: vldrw.u32 q1, [r6] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q1, r4 -; CHECK-NEXT: @ implicit-def: $q5 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: @ implicit-def: $q4 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: add.w r6, r0, r4 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r7, [r6] -; CHECK-NEXT: vmovne.32 q4[0], r7 -; CHECK-NEXT: lsls r7, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #1] -; CHECK-NEXT: vmovmi.32 q4[1], r7 -; CHECK-NEXT: lsls r7, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #2] -; CHECK-NEXT: vmovmi.32 q4[2], r7 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r5, [r6, #3] -; CHECK-NEXT: vmovmi.32 q4[3], r5 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; 
CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: add.w r6, r1, r4 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r7, [r6] -; CHECK-NEXT: vmovne.32 q5[0], r7 -; CHECK-NEXT: lsls r7, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #1] -; CHECK-NEXT: vmovmi.32 q5[1], r7 -; CHECK-NEXT: lsls r7, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #2] -; CHECK-NEXT: vmovmi.32 q5[2], r7 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r5, [r6, #3] -; CHECK-NEXT: vmovmi.32 q5[3], r5 -; CHECK-NEXT: vand q5, q5, q3 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vadd.i32 q4, q4, r2 +; CHECK-NEXT: vadd.i32 q2, q1, r7 +; CHECK-NEXT: adds r4, r0, r7 +; CHECK-NEXT: vpt.u32 cs, q0, q2 +; CHECK-NEXT: vldrbt.u32 q2, [r4] +; CHECK-NEXT: adds r4, r1, r7 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q4, [r3] +; CHECK-NEXT: vldrbt.u32 q3, [r4] +; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: le lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_12 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r12, lr, r12 +; CHECK-NEXT: sub.w r12, lr, r7 ; CHECK-NEXT: subs r4, r1, #3 ; CHECK-NEXT: subs r5, r0, #3 ; CHECK-NEXT: sub.w r7, r3, #16 @@ -728,9 +518,6 @@ ; CHECK-NEXT: str r7, [r3, #4]! 
; CHECK-NEXT: le lr, .LBB5_11 ; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.13: @@ -883,107 +670,41 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_short: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ldr.w r12, [sp, #8] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: beq.w .LBB6_3 -; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: add.w r5, r12, #3 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: add.w lr, r12, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic r5, r5, #3 -; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 -; CHECK-NEXT: adr r5, .LCPI6_0 +; CHECK-NEXT: bic lr, lr, #3 +; CHECK-NEXT: sub.w lr, lr, #4 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 ; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: vldrw.u32 q1, [r5] ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: adr r4, .LCPI6_0 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB6_2: @ %vector.body +; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vcmp.u32 cs, q0, q2 -; CHECK-NEXT: @ implicit-def: $q2 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx 
r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r6, [r0] -; CHECK-NEXT: vmovne.32 q2[0], r6 -; CHECK-NEXT: lsls r6, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r0, #2] -; CHECK-NEXT: vmovmi.32 q2[1], r6 -; CHECK-NEXT: lsls r6, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r0, #4] -; CHECK-NEXT: vmovmi.32 q2[2], r6 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r5, [r0, #6] -; CHECK-NEXT: vmovmi.32 q2[3], r5 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vadd.i32 q2, q1, r12 +; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vptt.u32 cs, q0, q2 +; CHECK-NEXT: vldrht.s32 q2, [r0] +; CHECK-NEXT: vldrht.s32 q3, [r1] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r6, [r1] -; CHECK-NEXT: vmovne.32 q3[0], r6 -; CHECK-NEXT: lsls r6, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r1, #2] -; CHECK-NEXT: vmovmi.32 q3[1], r6 -; CHECK-NEXT: lsls r6, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r1, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r6 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r5, [r1, #6] -; CHECK-NEXT: vmovmi.32 q3[3], r5 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmul.i32 q2, q3, q2 ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: vadd.i32 q2, q2, r2 ; CHECK-NEXT: vpst ; 
CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: le lr, .LBB6_2 -; CHECK-NEXT: .LBB6_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: le lr, .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI6_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 @@ -1035,134 +756,66 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_uchar: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #72] -; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: ldr r7, [sp, #28] +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq.w .LBB7_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r6, r3 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: add.w r4, r3, r7, lsl #2 +; CHECK-NEXT: adds r5, r1, r7 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r7 +; CHECK-NEXT: cset r12, hi +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: ands r5, r4 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: ands r6, r4 +; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq r7, r6 -; CHECK-NEXT: lslseq.w r7, r7, #31 +; CHECK-NEXT: andeq.w r6, r5, r12 +; CHECK-NEXT: lslseq.w r6, r6, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and lr, r12, #3 -; CHECK-NEXT: cmp r4, #3 -; CHECK-NEXT: bhs.w .LBB7_6 +; CHECK-NEXT: subs r6, r7, #1 +; 
CHECK-NEXT: and lr, r7, #3 +; CHECK-NEXT: cmp r6, #3 +; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB7_9 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: add.w r7, r12, #3 -; CHECK-NEXT: adr r5, .LCPI7_0 -; CHECK-NEXT: bic r7, r7, #3 -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vmov.i32 q3, #0xff +; CHECK-NEXT: adds r6, r7, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: bic r6, r6, #3 +; CHECK-NEXT: subs r7, #1 +; CHECK-NEXT: subs r6, #4 +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: adr r6, .LCPI7_0 +; CHECK-NEXT: vldrw.u32 q1, [r6] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q1, r4 -; CHECK-NEXT: @ implicit-def: $q5 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: @ implicit-def: $q4 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: add.w r6, r0, r4 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r7, [r6] -; CHECK-NEXT: vmovne.32 q4[0], r7 -; CHECK-NEXT: lsls r7, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #1] -; CHECK-NEXT: vmovmi.32 q4[1], r7 -; CHECK-NEXT: lsls r7, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #2] -; CHECK-NEXT: vmovmi.32 q4[2], r7 -; CHECK-NEXT: lsls 
r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r5, [r6, #3] -; CHECK-NEXT: vmovmi.32 q4[3], r5 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: add.w r6, r1, r4 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r7, [r6] -; CHECK-NEXT: vmovne.32 q5[0], r7 -; CHECK-NEXT: lsls r7, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #1] -; CHECK-NEXT: vmovmi.32 q5[1], r7 -; CHECK-NEXT: lsls r7, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #2] -; CHECK-NEXT: vmovmi.32 q5[2], r7 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r5, [r6, #3] -; CHECK-NEXT: vmovmi.32 q5[3], r5 -; CHECK-NEXT: vand q5, q5, q3 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vadd.i32 q4, q4, r2 +; CHECK-NEXT: vadd.i32 q2, q1, r7 +; CHECK-NEXT: adds r4, r0, r7 +; CHECK-NEXT: vpt.u32 cs, q0, q2 +; CHECK-NEXT: vldrbt.u32 q2, [r4] +; CHECK-NEXT: adds r4, r1, r7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q3, [r4] +; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q2, q2, r2 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q4, [r3] +; CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: le lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_12 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r12, lr, r12 +; CHECK-NEXT: sub.w r12, lr, r7 ; CHECK-NEXT: subs r4, r1, #3 ; CHECK-NEXT: subs r5, r0, #3 ; CHECK-NEXT: sub.w r7, r3, #16 @@ 
-1205,9 +858,6 @@ ; CHECK-NEXT: str r7, [r3, #4]! ; CHECK-NEXT: le lr, .LBB7_11 ; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.13: @@ -1360,107 +1010,41 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_ushort: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ldr.w r12, [sp, #8] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: beq.w .LBB8_3 -; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: add.w r5, r12, #3 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: add.w lr, r12, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic r5, r5, #3 -; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 -; CHECK-NEXT: adr r5, .LCPI8_0 +; CHECK-NEXT: bic lr, lr, #3 +; CHECK-NEXT: sub.w lr, lr, #4 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 ; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: vldrw.u32 q1, [r5] ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: adr r4, .LCPI8_0 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB8_2: @ %vector.body +; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vcmp.u32 cs, q0, q2 -; CHECK-NEXT: @ implicit-def: $q2 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; 
CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r6, [r0] -; CHECK-NEXT: vmovne.32 q2[0], r6 -; CHECK-NEXT: lsls r6, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r0, #2] -; CHECK-NEXT: vmovmi.32 q2[1], r6 -; CHECK-NEXT: lsls r6, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r0, #4] -; CHECK-NEXT: vmovmi.32 q2[2], r6 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r5, [r0, #6] -; CHECK-NEXT: vmovmi.32 q2[3], r5 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vmovlb.u16 q2, q2 +; CHECK-NEXT: vadd.i32 q2, q1, r12 +; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vptt.u32 cs, q0, q2 +; CHECK-NEXT: vldrht.u32 q2, [r0] +; CHECK-NEXT: vldrht.u32 q3, [r1] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r6, [r1] -; CHECK-NEXT: vmovne.32 q3[0], r6 -; CHECK-NEXT: lsls r6, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r1, #2] -; CHECK-NEXT: vmovmi.32 q3[1], r6 -; CHECK-NEXT: lsls r6, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r1, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r6 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r5, [r1, #6] -; CHECK-NEXT: vmovmi.32 q3[3], r5 -; CHECK-NEXT: vmovlb.u16 q3, q3 -; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmul.i32 q2, q3, q2 ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: 
vadd.i32 q2, q2, r2 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: le lr, .LBB8_2 -; CHECK-NEXT: .LBB8_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: le lr, .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI8_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 Index: test/CodeGen/Thumb2/mve-masked-ldst.ll =================================================================== --- test/CodeGen/Thumb2/mve-masked-ldst.ll +++ test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -21,49 +21,11 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) { ; CHECK-LABEL: foo_sext_v4i32_v4i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r1, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r12, #0, #1 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, lr, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: ubfx r3, lr, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #3, #1 -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: 
ldrbmi r1, [r2, #3] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s32 gt, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r2] ; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -76,48 +38,11 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) { ; CHECK-LABEL: foo_sext_v4i32_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r1, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r12, #0, #1 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, lr, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: ubfx r3, lr, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #3, #1 -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r1, [r2, #6] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s32 gt, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r2] ; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <4 x 
i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -130,49 +55,11 @@ define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) { ; CHECK-LABEL: foo_zext_v4i32_v4i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vmov.i32 q1, #0xff -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r1, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r12, #0, #1 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, lr, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: ubfx r3, lr, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #3, #1 -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #3] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s32 gt, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r2] ; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -185,48 +72,11 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) { ; CHECK-LABEL: foo_zext_v4i32_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; 
CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r1, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r12, #0, #1 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, lr, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: ubfx r3, lr, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #3, #1 -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r1, [r2, #6] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s32 gt, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r2] ; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -253,79 +103,28 @@ } define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) { -; CHECK-LABEL: foo_sext_v8i16_v8i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r3, lr, #1 -; CHECK-NEXT: ubfx r1, lr, #2, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; 
CHECK-NEXT: bfi r3, r12, #0, #1 -; CHECK-NEXT: bfi r3, r1, #1, #1 -; CHECK-NEXT: ubfx r1, lr, #4, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #2, #1 -; CHECK-NEXT: ubfx r1, lr, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #3, #1 -; CHECK-NEXT: ubfx r1, lr, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #4, #1 -; CHECK-NEXT: ubfx r1, lr, #10, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #5, #1 -; CHECK-NEXT: ubfx r1, lr, #12, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #6, #1 -; CHECK-NEXT: ubfx r1, lr, #14, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r3 -; CHECK-NEXT: lsls r3, r3, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.16 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.16 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.16 q0[2], r3 -; CHECK-NEXT: lsls r3, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #3] -; CHECK-NEXT: vmovmi.16 q0[3], r3 -; CHECK-NEXT: lsls r3, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.16 q0[4], r3 -; CHECK-NEXT: lsls r3, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #5] -; CHECK-NEXT: vmovmi.16 q0[5], r3 -; CHECK-NEXT: lsls r3, r1, #25 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #6] -; CHECK-NEXT: vmovmi.16 q0[6], r3 -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #7] -; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; CHECK-LE-LABEL: foo_sext_v8i16_v8i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrh.u16 q1, [r1] +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: vpt.s16 gt, 
q1, zr +; CHECK-LE-NEXT: vldrbt.s16 q1, [r2] +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vstrht.16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_sext_v8i16_v8i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vrev32.16 q0, q0 +; CHECK-BE-NEXT: vpt.s16 gt, q1, zr +; CHECK-BE-NEXT: vldrbt.s16 q1, [r2] +; CHECK-BE-NEXT: vpsel q0, q1, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrht.16 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -336,79 +135,28 @@ } define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) { -; CHECK-LABEL: foo_zext_v8i16_v8i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r3, lr, #1 -; CHECK-NEXT: ubfx r1, lr, #2, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r12, #0, #1 -; CHECK-NEXT: bfi r3, r1, #1, #1 -; CHECK-NEXT: ubfx r1, lr, #4, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #2, #1 -; CHECK-NEXT: ubfx r1, lr, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #3, #1 -; CHECK-NEXT: ubfx r1, lr, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #4, #1 -; CHECK-NEXT: ubfx r1, lr, #10, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #5, #1 -; CHECK-NEXT: ubfx r1, lr, #12, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #6, #1 -; CHECK-NEXT: ubfx r1, lr, #14, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r3 -; CHECK-NEXT: lsls r3, r3, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; 
CHECK-NEXT: vmovne.16 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.16 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.16 q0[2], r3 -; CHECK-NEXT: lsls r3, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #3] -; CHECK-NEXT: vmovmi.16 q0[3], r3 -; CHECK-NEXT: lsls r3, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.16 q0[4], r3 -; CHECK-NEXT: lsls r3, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #5] -; CHECK-NEXT: vmovmi.16 q0[5], r3 -; CHECK-NEXT: lsls r3, r1, #25 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #6] -; CHECK-NEXT: vmovmi.16 q0[6], r3 -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #7] -; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; CHECK-LE-LABEL: foo_zext_v8i16_v8i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrh.u16 q1, [r1] +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: vpt.s16 gt, q1, zr +; CHECK-LE-NEXT: vldrbt.u16 q1, [r2] +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: vpst +; CHECK-LE-NEXT: vstrht.16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_zext_v8i16_v8i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vrev32.16 q0, q0 +; CHECK-BE-NEXT: vpt.s16 gt, q1, zr +; CHECK-BE-NEXT: vldrbt.u16 q1, [r2] +; CHECK-BE-NEXT: vpsel q0, q1, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrht.16 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -435,74 +183,23 @@ } define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) { -; CHECK-LABEL: 
foo_trunc_v8i8_v8i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: vpt.s16 gt, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r2] -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: and r2, r1, #1 -; CHECK-NEXT: rsbs r3, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r3, #0, #1 -; CHECK-NEXT: ubfx r3, r1, #2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #1, #1 -; CHECK-NEXT: ubfx r3, r1, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #2, #1 -; CHECK-NEXT: ubfx r3, r1, #6, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: ubfx r3, r1, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #4, #1 -; CHECK-NEXT: ubfx r3, r1, #10, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #5, #1 -; CHECK-NEXT: ubfx r3, r1, #12, #1 -; CHECK-NEXT: ubfx r1, r1, #14, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r2 -; CHECK-NEXT: lsls r2, r2, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne.u16 r2, q0[0] -; CHECK-NEXT: strbne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[1] -; CHECK-NEXT: strbmi r2, [r0, #1] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[2] -; CHECK-NEXT: strbmi r2, [r0, #2] -; CHECK-NEXT: lsls r2, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[3] -; CHECK-NEXT: strbmi r2, [r0, #3] -; CHECK-NEXT: lsls r2, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[4] -; CHECK-NEXT: strbmi r2, [r0, #4] -; CHECK-NEXT: lsls r2, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[5] -; CHECK-NEXT: strbmi r2, [r0, #5] -; CHECK-NEXT: lsls r2, r1, #25 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[6] -; CHECK-NEXT: strbmi r2, [r0, #6] -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: 
itt mi -; CHECK-NEXT: vmovmi.u16 r1, q0[7] -; CHECK-NEXT: strbmi r1, [r0, #7] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v8i8_v8i16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrh.u16 q0, [r1] +; CHECK-LE-NEXT: vptt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrht.u16 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v8i8_v8i16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: vpt.s16 gt, q0, zr +; CHECK-BE-NEXT: vldrht.u16 q0, [r2] +; CHECK-BE-NEXT: vrev16.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -513,45 +210,23 @@ } define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { -; CHECK-LABEL: foo_trunc_v4i8_v4i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vpt.s32 gt, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r2] -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r1, r2, #1 -; CHECK-NEXT: rsbs r3, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r1, r3, #0, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r1, r2, #3, #1 -; CHECK-NEXT: lsls r2, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r2, s0 -; CHECK-NEXT: strbne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s1 -; CHECK-NEXT: strbmi r2, [r0, #1] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s2 -; CHECK-NEXT: strbmi r2, [r0, #2] -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r1, s3 -; CHECK-NEXT: 
strbmi r1, [r0, #3] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v4i8_v4i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: vptt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v4i8_v4i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -562,45 +237,23 @@ } define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { -; CHECK-LABEL: foo_trunc_v4i16_v4i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vpt.s32 gt, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r2] -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r1, r2, #1 -; CHECK-NEXT: rsbs r3, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r1, r3, #0, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r1, r2, #3, #1 -; CHECK-NEXT: lsls r2, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r2, s0 -; CHECK-NEXT: strhne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s1 -; CHECK-NEXT: strhmi r2, [r0, #2] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s2 -; CHECK-NEXT: strhmi r2, [r0, #4] -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r1, s3 -; CHECK-NEXT: strhmi r1, [r0, #6] -; CHECK-NEXT: add sp, #4 -; 
CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v4i16_v4i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: vptt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v4i16_v4i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer