diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9345,6 +9345,47 @@
   return SDValue(N, 0); // Return N so it doesn't get rechecked!
 }
 
+/// Try to fold an extension of a masked load into a single extending masked
+/// load: (ext (masked_load x)) -> (ext_masked_load x).
+///
+/// \p N is the extend node being combined and \p N0 is its operand. \p ExtOpc
+/// is the extend opcode and \p ExtLoadType the corresponding load extension
+/// kind. The fold is only performed when \p N0 is a single-use, non-extending
+/// masked load and \p TLI reports the extending load as legal and desirable.
+///
+/// \returns the new extending masked load of type \p VT, or an empty SDValue
+/// when the fold does not apply.
+static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
+                                        const TargetLowering &TLI, EVT VT,
+                                        SDNode *N, SDValue N0,
+                                        ISD::LoadExtType ExtLoadType,
+                                        ISD::NodeType ExtOpc) {
+  if (!N0.hasOneUse())
+    return SDValue();
+
+  MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
+  if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
+    return SDValue();
+
+  if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0)))
+    return SDValue();
+
+  if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+    return SDValue();
+
+  SDLoc dl(Ld);
+  // Extend the pass-through value too so it matches the wider result type.
+  SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
+  SDValue NewLoad = DAG.getMaskedLoad(VT, dl, Ld->getChain(),
+                                      Ld->getBasePtr(), Ld->getMask(),
+                                      PassThru, Ld->getMemoryVT(),
+                                      Ld->getMemOperand(), ExtLoadType,
+                                      Ld->isExpandingLoad());
+  // Re-point users of the old load's value #1 (its output chain) at NewLoad.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
+  return NewLoad;
+}
+
 static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
                                        bool LegalOperations) {
   assert((N->getOpcode() == ISD::SIGN_EXTEND ||
@@ -9445,6 +9486,11 @@
                          ISD::SEXTLOAD, ISD::SIGN_EXTEND))
     return foldedExt;
 
+  if (SDValue foldedExt =
+      tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
+                               ISD::SIGN_EXTEND))
+    return foldedExt;
+
   // fold (sext (load x)) to multiple smaller sextloads.
   // Only on illegal but splittable vectors.
   if (SDValue ExtLoad = CombineExtLoad(N))
@@ -9733,6 +9779,11 @@
                          ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
     return foldedExt;
 
+  if (SDValue foldedExt =
+      tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
+                               ISD::ZERO_EXTEND))
+    return foldedExt;
+
   // fold (zext (load x)) to multiple smaller zextloads.
   // Only on illegal but splittable vectors.
   if (SDValue ExtLoad = CombineExtLoad(N))
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -8898,9 +8898,13 @@
   SDValue PassThru = N->getPassThru();
   SDLoc dl(Op);
 
-  if (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
+  auto IsZero = [](SDValue PassThru) {
+    return (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
       (PassThru->getOpcode() == ARMISD::VMOVIMM &&
-       isNullConstant(PassThru->getOperand(0))))
+       isNullConstant(PassThru->getOperand(0))));
+  };
+
+  if (IsZero(PassThru))
     return Op;
 
   // MVE Masked loads use zero as the passthru value. Here we convert undef to
@@ -8911,7 +8915,9 @@
       VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
       N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
   SDValue Combo = NewLoad;
-  if (!PassThru.isUndef())
+  if (!PassThru.isUndef() &&
+      (PassThru.getOpcode() != ISD::BITCAST ||
+       !IsZero(PassThru->getOperand(0))))
     Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
   return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
 }
@@ -14698,6 +14704,11 @@
   if (!isTypeLegal(VT))
     return false;
 
+  if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
+    if (Ld->isExpandingLoad())
+      return false;
+  }
+
   // Don't create a loadext if we can fold the extension into a wide/long
   // instruction.
// If there's more than one user instruction, the loadext is desirable no diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -5071,16 +5071,52 @@ return cast(N)->getAlignment() >= 2; }]>; -def alignedmaskedload32 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ - return cast(N)->getAlignment() >= 4; + +def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + return Ld->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + return cast(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + return cast(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; +def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; +}]>; +def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2; }]>; -def alignedmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ - return cast(N)->getAlignment() >= 2; +def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + return 
cast(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + return cast(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; +def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; +}]>; +def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4; }]>; -def maskedload : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru)>; def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), (masked_st node:$val, node:$ptr, node:$pred), [{ @@ -5090,6 +5126,7 @@ (masked_st node:$val, node:$ptr, node:$pred), [{ return cast(N)->getAlignment() >= 2; }]>; + def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred), (masked_st node:$val, node:$ptr, node:$pred)>; @@ -5121,16 +5158,6 @@ (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; - - // Unaligned masked loads - def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))), - (v4f32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v8i16 (maskedload 
t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))), - (v8i16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))), - (v8f16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; } let Predicates = [HasMVEInt, IsBE] in { @@ -5195,15 +5222,6 @@ (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; - // Unaligned masked loads - def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>; - def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))), - (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>; - def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))), - (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>; - def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))), - (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>; } let Predicates = [HasMVEInt] in { @@ -5214,11 +5232,39 @@ def : MVE_vector_maskedstore_typed; def : MVE_vector_maskedstore_typed; // Aligned masked loads - def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; + // Extending masked loads. 
+  def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                   (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                   (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+                                     (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+                                     (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+                                    (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
 }
 
 // Widening/Narrowing Loads/Stores
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -495,16 +495,21 @@
   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
     return false;
 
-  if (DataTy->isVectorTy()) {
-    // We don't yet support narrowing or widening masked loads/stores. Expand
-    // them for the moment.
-    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
-    if (VecWidth != 128)
+  if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+    // Don't support v2i1 yet.
+    if (VecTy->getNumElements() == 2)
+      return false;
+
+    // We don't support extending fp types.
+    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
+    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
       return false;
   }
 
   unsigned EltWidth = DataTy->getScalarSizeInBits();
-  return EltWidth == 32 || EltWidth == 16 || EltWidth == 8;
+  return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+         (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
+         (EltWidth == 8);
 }
 
 int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29056,6 +29056,9 @@
 }
 
 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+  if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
+    return false;
+
   EVT SrcVT = ExtVal.getOperand(0).getValueType();
 
   // There is no extending load for vXi1.
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -4,75 +4,39 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_char: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it eq +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: sub.w lr, r3, #4 -; CHECK-NEXT: adr r2, .LCPI0_0 +; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: add.w lr, r3, lr, lsr #2 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vmov.i32 q2, #0xff +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: adr r3, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: @ implicit-def: $q4 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r4, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r4, #0, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #1, #1 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #2, 
#1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: lsls r3, r2, #31 -; CHECK-NEXT: add.w r3, r1, r0 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r4, [r3] -; CHECK-NEXT: vmovne.32 q4[0], r4 -; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r4, [r3, #1] -; CHECK-NEXT: vmovmi.32 q4[1], r4 -; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r4, [r3, #2] -; CHECK-NEXT: vmovmi.32 q4[2], r4 -; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r2, [r3, #3] -; CHECK-NEXT: vmovmi.32 q4[3], r2 -; CHECK-NEXT: vand q5, q4, q2 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmla.u32 q4, q5, r12 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: adds r3, r1, r2 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpt.u32 cs, q1, q4 +; CHECK-NEXT: vldrbt.u32 q4, [r3] +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.u32 q0, q4, r0 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q4, q3 +; CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: @@ -125,74 +89,39 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_short: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it eq +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: sub.w lr, r3, #4 -; CHECK-NEXT: adr r2, .LCPI1_0 
+; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: add.w lr, r3, lr, lsr #2 -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: adr r3, .LCPI1_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vcmp.u32 cs, q0, q3 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r4, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r4, #0, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #1, #1 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: lsls r3, r2, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r1] -; CHECK-NEXT: vmovne.32 q3[0], r3 -; CHECK-NEXT: lsls r3, r2, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r1, #2] -; CHECK-NEXT: vmovmi.32 q3[1], r3 -; CHECK-NEXT: lsls r3, r2, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r1, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r3 -; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r2, [r1, #6] -; CHECK-NEXT: vmovmi.32 q3[3], r2 -; CHECK-NEXT: vmovlb.s16 q4, q3 -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpt.u32 cs, q1, q4 +; CHECK-NEXT: vldrht.s32 q4, [r1] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vmla.u32 q3, q4, r12 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.u32 q0, q4, r0 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q3, q2 +; 
CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI1_0: @@ -245,75 +174,39 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_uchar: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it eq +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: sub.w lr, r3, #4 -; CHECK-NEXT: adr r2, .LCPI2_0 +; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: add.w lr, r3, lr, lsr #2 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vmov.i32 q2, #0xff +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: adr r3, .LCPI2_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: @ implicit-def: $q4 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r4, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r4, #0, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #1, #1 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #2, #1 -; 
CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: lsls r3, r2, #31 -; CHECK-NEXT: add.w r3, r1, r0 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r4, [r3] -; CHECK-NEXT: vmovne.32 q4[0], r4 -; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r4, [r3, #1] -; CHECK-NEXT: vmovmi.32 q4[1], r4 -; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r4, [r3, #2] -; CHECK-NEXT: vmovmi.32 q4[2], r4 -; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r2, [r3, #3] -; CHECK-NEXT: vmovmi.32 q4[3], r2 -; CHECK-NEXT: vand q5, q4, q2 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmla.u32 q4, q5, r12 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: adds r3, r1, r2 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpt.u32 cs, q1, q4 +; CHECK-NEXT: vldrbt.u32 q4, [r3] +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.u32 q0, q4, r0 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q4, q3 +; CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI2_0: @@ -366,74 +259,39 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: test_acc_scalar_ushort: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it eq +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: sub.w lr, r3, #4 -; CHECK-NEXT: adr r2, .LCPI3_0 +; 
CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: add.w lr, r3, lr, lsr #2 -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: adr r3, .LCPI3_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vcmp.u32 cs, q0, q3 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r4, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r4, #0, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #1, #1 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: lsls r3, r2, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r1] -; CHECK-NEXT: vmovne.32 q3[0], r3 -; CHECK-NEXT: lsls r3, r2, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r1, #2] -; CHECK-NEXT: vmovmi.32 q3[1], r3 -; CHECK-NEXT: lsls r3, r2, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r1, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r3 -; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r2, [r1, #6] -; CHECK-NEXT: vmovmi.32 q3[3], r2 -; CHECK-NEXT: vmovlb.u16 q4, q3 -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vadd.i32 q4, q2, r2 +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vpt.u32 cs, q1, q4 +; CHECK-NEXT: vldrht.u32 q4, [r1] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vmla.u32 q3, q4, r12 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.u32 q0, q4, r0 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q3, q2 +; 
CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI3_0: @@ -558,134 +416,66 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_char: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #72] -; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: ldr r7, [sp, #28] +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq.w .LBB5_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r6, r3 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: add.w r4, r3, r7, lsl #2 +; CHECK-NEXT: adds r5, r1, r7 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r7 +; CHECK-NEXT: cset r12, hi +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: ands r5, r4 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: ands r6, r4 +; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq r7, r6 -; CHECK-NEXT: lslseq.w r7, r7, #31 +; CHECK-NEXT: andeq.w r6, r5, r12 +; CHECK-NEXT: lslseq.w r6, r6, #31 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and lr, r12, #3 -; CHECK-NEXT: cmp r4, #3 -; CHECK-NEXT: bhs.w .LBB5_6 +; CHECK-NEXT: subs r6, r7, #1 +; CHECK-NEXT: and lr, r7, #3 +; CHECK-NEXT: cmp r6, #3 +; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB5_9 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: add.w r7, r12, #3 -; 
CHECK-NEXT: adr r5, .LCPI5_0 -; CHECK-NEXT: bic r7, r7, #3 -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vmov.i32 q3, #0xff +; CHECK-NEXT: adds r6, r7, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: bic r6, r6, #3 +; CHECK-NEXT: subs r7, #1 +; CHECK-NEXT: subs r6, #4 +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: adr r6, .LCPI5_0 +; CHECK-NEXT: vldrw.u32 q1, [r6] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q1, r4 -; CHECK-NEXT: @ implicit-def: $q5 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: @ implicit-def: $q4 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: add.w r6, r0, r4 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r7, [r6] -; CHECK-NEXT: vmovne.32 q4[0], r7 -; CHECK-NEXT: lsls r7, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #1] -; CHECK-NEXT: vmovmi.32 q4[1], r7 -; CHECK-NEXT: lsls r7, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #2] -; CHECK-NEXT: vmovmi.32 q4[2], r7 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r5, [r6, #3] -; CHECK-NEXT: vmovmi.32 q4[3], r5 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; 
CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: add.w r6, r1, r4 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r7, [r6] -; CHECK-NEXT: vmovne.32 q5[0], r7 -; CHECK-NEXT: lsls r7, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #1] -; CHECK-NEXT: vmovmi.32 q5[1], r7 -; CHECK-NEXT: lsls r7, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #2] -; CHECK-NEXT: vmovmi.32 q5[2], r7 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r5, [r6, #3] -; CHECK-NEXT: vmovmi.32 q5[3], r5 -; CHECK-NEXT: vand q5, q5, q3 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vadd.i32 q4, q4, r2 +; CHECK-NEXT: vadd.i32 q2, q1, r7 +; CHECK-NEXT: adds r4, r0, r7 +; CHECK-NEXT: vpt.u32 cs, q0, q2 +; CHECK-NEXT: vldrbt.u32 q2, [r4] +; CHECK-NEXT: adds r4, r1, r7 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q4, [r3] +; CHECK-NEXT: vldrbt.u32 q3, [r4] +; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: le lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_12 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r12, lr, r12 +; CHECK-NEXT: sub.w r12, lr, r7 ; CHECK-NEXT: subs r4, r1, #3 ; CHECK-NEXT: subs r5, r0, #3 ; CHECK-NEXT: sub.w r7, r3, #16 @@ -728,9 +518,6 @@ ; CHECK-NEXT: str r7, [r3, #4]! 
; CHECK-NEXT: le lr, .LBB5_11 ; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.13: @@ -883,107 +670,41 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_short: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ldr.w r12, [sp, #8] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: beq.w .LBB6_3 -; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: add.w r5, r12, #3 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: add.w lr, r12, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic r5, r5, #3 -; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 -; CHECK-NEXT: adr r5, .LCPI6_0 +; CHECK-NEXT: bic lr, lr, #3 +; CHECK-NEXT: sub.w lr, lr, #4 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 ; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: vldrw.u32 q1, [r5] ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: adr r4, .LCPI6_0 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB6_2: @ %vector.body +; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vcmp.u32 cs, q0, q2 -; CHECK-NEXT: @ implicit-def: $q2 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx 
r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r6, [r0] -; CHECK-NEXT: vmovne.32 q2[0], r6 -; CHECK-NEXT: lsls r6, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r0, #2] -; CHECK-NEXT: vmovmi.32 q2[1], r6 -; CHECK-NEXT: lsls r6, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r0, #4] -; CHECK-NEXT: vmovmi.32 q2[2], r6 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r5, [r0, #6] -; CHECK-NEXT: vmovmi.32 q2[3], r5 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vadd.i32 q2, q1, r12 +; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vptt.u32 cs, q0, q2 +; CHECK-NEXT: vldrht.s32 q2, [r0] +; CHECK-NEXT: vldrht.s32 q3, [r1] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r6, [r1] -; CHECK-NEXT: vmovne.32 q3[0], r6 -; CHECK-NEXT: lsls r6, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r1, #2] -; CHECK-NEXT: vmovmi.32 q3[1], r6 -; CHECK-NEXT: lsls r6, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r1, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r6 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r5, [r1, #6] -; CHECK-NEXT: vmovmi.32 q3[3], r5 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmul.i32 q2, q3, q2 ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: vadd.i32 q2, q2, r2 ; CHECK-NEXT: vpst ; 
CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: le lr, .LBB6_2 -; CHECK-NEXT: .LBB6_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: le lr, .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI6_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 @@ -1035,134 +756,66 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_uchar: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #72] -; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: ldr r7, [sp, #28] +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq.w .LBB7_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r6, r3 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: add.w r4, r3, r7, lsl #2 +; CHECK-NEXT: adds r5, r1, r7 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r7 +; CHECK-NEXT: cset r12, hi +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: ands r5, r4 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: ands r6, r4 +; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq r7, r6 -; CHECK-NEXT: lslseq.w r7, r7, #31 +; CHECK-NEXT: andeq.w r6, r5, r12 +; CHECK-NEXT: lslseq.w r6, r6, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and lr, r12, #3 -; CHECK-NEXT: cmp r4, #3 -; CHECK-NEXT: bhs.w .LBB7_6 +; CHECK-NEXT: subs r6, r7, #1 +; 
CHECK-NEXT: and lr, r7, #3 +; CHECK-NEXT: cmp r6, #3 +; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB7_9 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: add.w r7, r12, #3 -; CHECK-NEXT: adr r5, .LCPI7_0 -; CHECK-NEXT: bic r7, r7, #3 -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vmov.i32 q3, #0xff +; CHECK-NEXT: adds r6, r7, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: bic r6, r6, #3 +; CHECK-NEXT: subs r7, #1 +; CHECK-NEXT: subs r6, #4 +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: adr r6, .LCPI7_0 +; CHECK-NEXT: vldrw.u32 q1, [r6] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q1, r4 -; CHECK-NEXT: @ implicit-def: $q5 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: @ implicit-def: $q4 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: add.w r6, r0, r4 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r7, [r6] -; CHECK-NEXT: vmovne.32 q4[0], r7 -; CHECK-NEXT: lsls r7, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #1] -; CHECK-NEXT: vmovmi.32 q4[1], r7 -; CHECK-NEXT: lsls r7, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #2] -; CHECK-NEXT: vmovmi.32 q4[2], r7 -; CHECK-NEXT: lsls 
r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r5, [r6, #3] -; CHECK-NEXT: vmovmi.32 q4[3], r5 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: add.w r6, r1, r4 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r7, [r6] -; CHECK-NEXT: vmovne.32 q5[0], r7 -; CHECK-NEXT: lsls r7, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #1] -; CHECK-NEXT: vmovmi.32 q5[1], r7 -; CHECK-NEXT: lsls r7, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r7, [r6, #2] -; CHECK-NEXT: vmovmi.32 q5[2], r7 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r5, [r6, #3] -; CHECK-NEXT: vmovmi.32 q5[3], r5 -; CHECK-NEXT: vand q5, q5, q3 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vadd.i32 q4, q4, r2 +; CHECK-NEXT: vadd.i32 q2, q1, r7 +; CHECK-NEXT: adds r4, r0, r7 +; CHECK-NEXT: vpt.u32 cs, q0, q2 +; CHECK-NEXT: vldrbt.u32 q2, [r4] +; CHECK-NEXT: adds r4, r1, r7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q3, [r4] +; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q2, q2, r2 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q4, [r3] +; CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: le lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_12 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r12, lr, r12 +; CHECK-NEXT: sub.w r12, lr, r7 ; CHECK-NEXT: subs r4, r1, #3 ; CHECK-NEXT: subs r5, r0, #3 ; CHECK-NEXT: sub.w r7, r3, #16 @@ 
-1205,9 +858,6 @@ ; CHECK-NEXT: str r7, [r3, #4]! ; CHECK-NEXT: le lr, .LBB7_11 ; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.13: @@ -1360,107 +1010,41 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_ushort: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ldr.w r12, [sp, #8] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: beq.w .LBB8_3 -; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: add.w r5, r12, #3 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: add.w lr, r12, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic r5, r5, #3 -; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 -; CHECK-NEXT: adr r5, .LCPI8_0 +; CHECK-NEXT: bic lr, lr, #3 +; CHECK-NEXT: sub.w lr, lr, #4 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 ; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: vldrw.u32 q1, [r5] ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: adr r4, .LCPI8_0 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB8_2: @ %vector.body +; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vcmp.u32 cs, q0, q2 -; CHECK-NEXT: @ implicit-def: $q2 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; 
CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r6, [r0] -; CHECK-NEXT: vmovne.32 q2[0], r6 -; CHECK-NEXT: lsls r6, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r0, #2] -; CHECK-NEXT: vmovmi.32 q2[1], r6 -; CHECK-NEXT: lsls r6, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r0, #4] -; CHECK-NEXT: vmovmi.32 q2[2], r6 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r5, [r0, #6] -; CHECK-NEXT: vmovmi.32 q2[3], r5 -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vmovlb.u16 q2, q2 +; CHECK-NEXT: vadd.i32 q2, q1, r12 +; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vptt.u32 cs, q0, q2 +; CHECK-NEXT: vldrht.u32 q2, [r0] +; CHECK-NEXT: vldrht.u32 q3, [r1] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: and r5, r6, #1 -; CHECK-NEXT: rsbs r7, r5, #0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: bfi r5, r7, #0, #1 -; CHECK-NEXT: ubfx r7, r6, #4, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #1, #1 -; CHECK-NEXT: ubfx r7, r6, #8, #1 -; CHECK-NEXT: ubfx r6, r6, #12, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r5, r7, #2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r5, r6, #3, #1 -; CHECK-NEXT: lsls r6, r5, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r6, [r1] -; CHECK-NEXT: vmovne.32 q3[0], r6 -; CHECK-NEXT: lsls r6, r5, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r1, #2] -; CHECK-NEXT: vmovmi.32 q3[1], r6 -; CHECK-NEXT: lsls r6, r5, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r6, [r1, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r6 -; CHECK-NEXT: lsls r5, r5, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r5, [r1, #6] -; CHECK-NEXT: vmovmi.32 q3[3], r5 -; CHECK-NEXT: vmovlb.u16 q3, q3 -; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmul.i32 q2, q3, q2 ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: 
vadd.i32 q2, q2, r2 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: le lr, .LBB8_2 -; CHECK-NEXT: .LBB8_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: le lr, .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI8_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -21,49 +21,11 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) { ; CHECK-LABEL: foo_sext_v4i32_v4i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r1, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r12, #0, #1 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, lr, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: ubfx r3, lr, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #3, #1 -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; 
CHECK-NEXT: ldrbmi r1, [r2, #3] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s32 gt, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r2] ; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -76,48 +38,11 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) { ; CHECK-LABEL: foo_sext_v4i32_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r1, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r12, #0, #1 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, lr, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: ubfx r3, lr, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #3, #1 -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r1, [r2, #6] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s32 gt, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r2] ; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = 
load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -130,49 +55,11 @@ define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) { ; CHECK-LABEL: foo_zext_v4i32_v4i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vmov.i32 q1, #0xff -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r1, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r12, #0, #1 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, lr, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: ubfx r3, lr, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #3, #1 -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #3] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s32 gt, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r2] ; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -185,48 +72,11 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) { ; CHECK-LABEL: foo_zext_v4i32_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push 
{r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vcmp.s32 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r1, lr, #1 -; CHECK-NEXT: ubfx r3, lr, #4, #1 -; CHECK-NEXT: rsb.w r12, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r12, #0, #1 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, lr, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: ubfx r3, lr, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #3, #1 -; CHECK-NEXT: lsls r3, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrhne r3, [r2] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.32 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.32 q0[2], r3 -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrhmi r1, [r2, #6] -; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s32 gt, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r2] ; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -236,6 +86,636 @@ ret void } +define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) { +; CHECK-LE-LABEL: foo_sext_v2i64_v2i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r5, r7, lr} +; CHECK-LE-NEXT: push {r4, r5, r7, lr} +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: ldrd lr, r12, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q1 +; CHECK-LE-NEXT: movs r4, #0 +; CHECK-LE-NEXT: rsbs.w r3, lr, #0 +; CHECK-LE-NEXT: vmov.32 q0[0], lr +; CHECK-LE-NEXT: sbcs.w 
r3, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w lr, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt.w lr, #1 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, lr, #0, #1 +; CHECK-LE-NEXT: vmov.32 q0[2], r12 +; CHECK-LE-NEXT: and r3, r1, #3 +; CHECK-LE-NEXT: lsls r1, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r1, [r2] +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r1, [r2, #4] +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov r1, s6 +; CHECK-LE-NEXT: vmov.32 q1[0], r3 +; CHECK-LE-NEXT: rsbs r5, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: asr.w lr, r3, #31 +; CHECK-LE-NEXT: vmov.32 q1[1], lr +; CHECK-LE-NEXT: asr.w r12, r1, #31 +; CHECK-LE-NEXT: vmov.32 q1[2], r1 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: vmov.32 q1[3], r12 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r4, #1 +; CHECK-LE-NEXT: cmp r4, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r4, #1 +; CHECK-LE-NEXT: bfi r4, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r4, #3 +; CHECK-LE-NEXT: lsls r2, r4, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vstrne d2, [r0] +; CHECK-LE-NEXT: lsls r1, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi d3, [r0, #8] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; +; CHECK-BE-LABEL: foo_sext_v2i64_v2i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r5, r7, lr} +; CHECK-BE-NEXT: push {r4, r5, r7, lr} +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: ldrd r12, lr, [r1] 
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 +; CHECK-BE-NEXT: vmov.32 q0[1], r12 +; CHECK-BE-NEXT: @ implicit-def: $q2 +; CHECK-BE-NEXT: vmov.32 q0[3], lr +; CHECK-BE-NEXT: mov.w lr, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt.w lr, #1 +; CHECK-BE-NEXT: rsbs.w r1, r12, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r3, #1 +; CHECK-BE-NEXT: bfi r3, lr, #0, #1 +; CHECK-BE-NEXT: and r1, r3, #3 +; CHECK-BE-NEXT: lsls r3, r3, #31 +; CHECK-BE-NEXT: beq .LBB5_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: ldr r3, [r2] +; CHECK-BE-NEXT: vmov.32 q1[1], r3 +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: .LBB5_2: @ %else +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: bpl .LBB5_4 +; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1 +; CHECK-BE-NEXT: ldr r1, [r2, #4] +; CHECK-BE-NEXT: vrev64.32 q0, q2 +; CHECK-BE-NEXT: vmov.32 q0[3], r1 +; CHECK-BE-NEXT: vrev64.32 q2, q0 +; CHECK-BE-NEXT: .LBB5_4: @ %else2 +; CHECK-BE-NEXT: vrev64.32 q0, q2 +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: vmov r2, s11 +; CHECK-BE-NEXT: movs r4, #0 +; CHECK-BE-NEXT: vmov r3, s1 +; CHECK-BE-NEXT: vmov r1, s3 +; CHECK-BE-NEXT: rsbs r5, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-BE-NEXT: vmov r2, s9 +; CHECK-BE-NEXT: asr.w lr, r3, #31 +; CHECK-BE-NEXT: vmov.32 q1[0], lr +; CHECK-BE-NEXT: asr.w r12, r1, #31 +; CHECK-BE-NEXT: vmov.32 q1[1], r3 +; CHECK-BE-NEXT: vmov.32 q1[2], r12 +; CHECK-BE-NEXT: vmov.32 q1[3], r1 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r1, #1 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: rsbs r3, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r4, #1 +; CHECK-BE-NEXT: cmp r4, #0 +; CHECK-BE-NEXT: it ne +; 
CHECK-BE-NEXT: mvnne r4, #1 +; CHECK-BE-NEXT: bfi r4, r1, #0, #1 +; CHECK-BE-NEXT: and r1, r4, #3 +; CHECK-BE-NEXT: lsls r2, r4, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vstrne d0, [r0] +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi d1, [r0, #8] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: pop {r4, r5, r7, pc} +entry: + %0 = load <2 x i32>, <2 x i32>* %mask, align 4 + %1 = icmp sgt <2 x i32> %0, zeroinitializer + %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef) + %3 = sext <2 x i32> %2 to <2 x i64> + call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1) + ret void +} + +define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) { +; CHECK-LE-LABEL: foo_sext_v2i64_v2i32_unaligned: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r5, r7, lr} +; CHECK-LE-NEXT: push {r4, r5, r7, lr} +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: ldrd lr, r12, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q1 +; CHECK-LE-NEXT: movs r4, #0 +; CHECK-LE-NEXT: rsbs.w r3, lr, #0 +; CHECK-LE-NEXT: vmov.32 q0[0], lr +; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w lr, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt.w lr, #1 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, lr, #0, #1 +; CHECK-LE-NEXT: vmov.32 q0[2], r12 +; CHECK-LE-NEXT: and r3, r1, #3 +; CHECK-LE-NEXT: lsls r1, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r1, [r2] +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r1, [r2, #4] +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: vmov r2, s0 +; 
CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov r1, s6 +; CHECK-LE-NEXT: vmov.32 q1[0], r3 +; CHECK-LE-NEXT: rsbs r5, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: asr.w lr, r3, #31 +; CHECK-LE-NEXT: vmov.32 q1[1], lr +; CHECK-LE-NEXT: asr.w r12, r1, #31 +; CHECK-LE-NEXT: vmov.32 q1[2], r1 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: vmov.32 q1[3], r12 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r4, #1 +; CHECK-LE-NEXT: cmp r4, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r4, #1 +; CHECK-LE-NEXT: bfi r4, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r4, #3 +; CHECK-LE-NEXT: lsls r2, r4, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne r2, r3, d2 +; CHECK-LE-NEXT: strdne r2, r3, [r0] +; CHECK-LE-NEXT: lsls r1, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, r2, d3 +; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; +; CHECK-BE-LABEL: foo_sext_v2i64_v2i32_unaligned: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r5, r7, lr} +; CHECK-BE-NEXT: push {r4, r5, r7, lr} +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: ldrd r12, lr, [r1] +; CHECK-BE-NEXT: rsbs.w r1, lr, #0 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 +; CHECK-BE-NEXT: vmov.32 q0[1], r12 +; CHECK-BE-NEXT: @ implicit-def: $q2 +; CHECK-BE-NEXT: vmov.32 q0[3], lr +; CHECK-BE-NEXT: mov.w lr, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt.w lr, #1 +; CHECK-BE-NEXT: rsbs.w r1, r12, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r3, #1 +; CHECK-BE-NEXT: bfi r3, lr, #0, #1 +; CHECK-BE-NEXT: and r1, r3, #3 +; CHECK-BE-NEXT: lsls r3, r3, #31 +; 
CHECK-BE-NEXT: beq .LBB6_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: ldr r3, [r2] +; CHECK-BE-NEXT: vmov.32 q1[1], r3 +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: .LBB6_2: @ %else +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: bpl .LBB6_4 +; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1 +; CHECK-BE-NEXT: ldr r1, [r2, #4] +; CHECK-BE-NEXT: vrev64.32 q0, q2 +; CHECK-BE-NEXT: vmov.32 q0[3], r1 +; CHECK-BE-NEXT: vrev64.32 q2, q0 +; CHECK-BE-NEXT: .LBB6_4: @ %else2 +; CHECK-BE-NEXT: vrev64.32 q0, q2 +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: vmov r2, s11 +; CHECK-BE-NEXT: movs r4, #0 +; CHECK-BE-NEXT: vmov r3, s1 +; CHECK-BE-NEXT: vmov r1, s3 +; CHECK-BE-NEXT: rsbs r5, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-BE-NEXT: vmov r2, s9 +; CHECK-BE-NEXT: asr.w lr, r3, #31 +; CHECK-BE-NEXT: vmov.32 q1[0], lr +; CHECK-BE-NEXT: asr.w r12, r1, #31 +; CHECK-BE-NEXT: vmov.32 q1[1], r3 +; CHECK-BE-NEXT: vmov.32 q1[2], r12 +; CHECK-BE-NEXT: vmov.32 q1[3], r1 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r1, #1 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: rsbs r3, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r4, #1 +; CHECK-BE-NEXT: cmp r4, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r4, #1 +; CHECK-BE-NEXT: bfi r4, r1, #0, #1 +; CHECK-BE-NEXT: and r1, r4, #3 +; CHECK-BE-NEXT: lsls r2, r4, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne r2, r3, d0 +; CHECK-BE-NEXT: strdne r3, r2, [r0] +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, r2, d1 +; CHECK-BE-NEXT: strdmi r2, r1, [r0, #8] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: pop {r4, r5, r7, pc} +entry: + %0 = load <2 x i32>, <2 x i32>* %mask, align 4 + %1 = icmp sgt <2 x i32> %0, zeroinitializer + %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef) + %3 = 
sext <2 x i32> %2 to <2 x i64> + call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1) + ret void +} + +define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) { +; CHECK-LE-LABEL: foo_zext_v2i64_v2i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: ldrd lr, r12, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q1 +; CHECK-LE-NEXT: rsbs.w r3, lr, #0 +; CHECK-LE-NEXT: vmov.32 q0[0], lr +; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w lr, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt.w lr, #1 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, lr, #0, #1 +; CHECK-LE-NEXT: vmov.32 q0[2], r12 +; CHECK-LE-NEXT: and r3, r1, #3 +; CHECK-LE-NEXT: adr.w r12, .LCPI7_0 +; CHECK-LE-NEXT: lsls r1, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r1, [r2] +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 +; CHECK-LE-NEXT: vmov r3, s0 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r1, [r2, #4] +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vldrw.u32 q2, [r12] +; CHECK-LE-NEXT: mov.w r12, #0 +; CHECK-LE-NEXT: vand q1, q1, q2 +; CHECK-LE-NEXT: rsbs r1, r3, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31 +; CHECK-LE-NEXT: vmov r3, s2 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt.w r12, #1 +; CHECK-LE-NEXT: rsbs r1, r3, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r2, #1 +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r2, #1 +; CHECK-LE-NEXT: bfi r2, r12, #0, #1 +; CHECK-LE-NEXT: and r1, r2, #3 +; CHECK-LE-NEXT: 
lsls r2, r2, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vstrne d2, [r0] +; CHECK-LE-NEXT: lsls r1, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi d3, [r0, #8] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-LE-NEXT: .p2align 4 +; CHECK-LE-NEXT: @ %bb.1: +; CHECK-LE-NEXT: .LCPI7_0: +; CHECK-LE-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-LE-NEXT: .long 0 @ 0x0 +; CHECK-LE-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-LE-NEXT: .long 0 @ 0x0 +; +; CHECK-BE-LABEL: foo_zext_v2i64_v2i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: ldrd r12, lr, [r1] +; CHECK-BE-NEXT: rsbs.w r1, lr, #0 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 +; CHECK-BE-NEXT: vmov.32 q0[1], r12 +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vmov.32 q0[3], lr +; CHECK-BE-NEXT: mov.w lr, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt.w lr, #1 +; CHECK-BE-NEXT: rsbs.w r1, r12, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r3, #1 +; CHECK-BE-NEXT: bfi r3, lr, #0, #1 +; CHECK-BE-NEXT: and r1, r3, #3 +; CHECK-BE-NEXT: lsls r3, r3, #31 +; CHECK-BE-NEXT: beq .LBB7_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: ldr r3, [r2] +; CHECK-BE-NEXT: vmov.32 q2[1], r3 +; CHECK-BE-NEXT: vrev64.32 q1, q2 +; CHECK-BE-NEXT: .LBB7_2: @ %else +; CHECK-BE-NEXT: vrev64.32 q2, q0 +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: bpl .LBB7_4 +; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1 +; CHECK-BE-NEXT: ldr r1, [r2, #4] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: vmov.32 q0[3], r1 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: .LBB7_4: @ %else2 +; CHECK-BE-NEXT: vrev64.32 q3, q2 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: vmov r3, s15 +; CHECK-BE-NEXT: adr.w r12, 
.LCPI7_0 +; CHECK-BE-NEXT: vldrb.u8 q0, [r12] +; CHECK-BE-NEXT: mov.w r12, #0 +; CHECK-BE-NEXT: vrev64.8 q2, q0 +; CHECK-BE-NEXT: vand q0, q1, q2 +; CHECK-BE-NEXT: rsbs r1, r3, #0 +; CHECK-BE-NEXT: sbcs.w r1, r2, r3, asr #31 +; CHECK-BE-NEXT: vmov r3, s13 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt.w r12, #1 +; CHECK-BE-NEXT: rsbs r1, r3, #0 +; CHECK-BE-NEXT: sbcs.w r1, r2, r3, asr #31 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r2, #1 +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r2, #1 +; CHECK-BE-NEXT: bfi r2, r12, #0, #1 +; CHECK-BE-NEXT: and r1, r2, #3 +; CHECK-BE-NEXT: lsls r2, r2, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vstrne d0, [r0] +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi d1, [r0, #8] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: pop {r7, pc} +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: @ %bb.5: +; CHECK-BE-NEXT: .LCPI7_0: +; CHECK-BE-NEXT: .long 0 @ 0x0 +; CHECK-BE-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-BE-NEXT: .long 0 @ 0x0 +; CHECK-BE-NEXT: .long 4294967295 @ 0xffffffff +entry: + %0 = load <2 x i32>, <2 x i32>* %mask, align 4 + %1 = icmp sgt <2 x i32> %0, zeroinitializer + %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef) + %3 = zext <2 x i32> %2 to <2 x i64> + call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1) + ret void +} + +define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) { +; CHECK-LE-LABEL: foo_zext_v2i64_v2i32_unaligned: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: ldrd lr, r12, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q1 +; CHECK-LE-NEXT: rsbs.w r3, lr, #0 +; CHECK-LE-NEXT: vmov.32 q0[0], lr +; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w lr, #0 
+; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt.w lr, #1 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, lr, #0, #1 +; CHECK-LE-NEXT: vmov.32 q0[2], r12 +; CHECK-LE-NEXT: and r3, r1, #3 +; CHECK-LE-NEXT: adr.w r12, .LCPI8_0 +; CHECK-LE-NEXT: lsls r1, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r1, [r2] +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 +; CHECK-LE-NEXT: vmov r3, s0 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r1, [r2, #4] +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vldrw.u32 q2, [r12] +; CHECK-LE-NEXT: mov.w r12, #0 +; CHECK-LE-NEXT: vand q1, q1, q2 +; CHECK-LE-NEXT: rsbs r1, r3, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31 +; CHECK-LE-NEXT: vmov r3, s2 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt.w r12, #1 +; CHECK-LE-NEXT: rsbs r1, r3, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r2, #1 +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r2, #1 +; CHECK-LE-NEXT: bfi r2, r12, #0, #1 +; CHECK-LE-NEXT: and r1, r2, #3 +; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne r2, r3, d2 +; CHECK-LE-NEXT: strdne r2, r3, [r0] +; CHECK-LE-NEXT: lsls r1, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, r2, d3 +; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-LE-NEXT: .p2align 4 +; CHECK-LE-NEXT: @ %bb.1: +; CHECK-LE-NEXT: .LCPI8_0: +; CHECK-LE-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-LE-NEXT: .long 0 @ 0x0 +; CHECK-LE-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-LE-NEXT: .long 0 @ 0x0 +; +; CHECK-BE-LABEL: foo_zext_v2i64_v2i32_unaligned: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, 
lr} +; CHECK-BE-NEXT: push {r7, lr} +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: ldrd r12, lr, [r1] +; CHECK-BE-NEXT: rsbs.w r1, lr, #0 +; CHECK-BE-NEXT: mov.w r3, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 +; CHECK-BE-NEXT: vmov.32 q0[1], r12 +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vmov.32 q0[3], lr +; CHECK-BE-NEXT: mov.w lr, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt.w lr, #1 +; CHECK-BE-NEXT: rsbs.w r1, r12, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r3, #1 +; CHECK-BE-NEXT: bfi r3, lr, #0, #1 +; CHECK-BE-NEXT: and r1, r3, #3 +; CHECK-BE-NEXT: lsls r3, r3, #31 +; CHECK-BE-NEXT: beq .LBB8_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: ldr r3, [r2] +; CHECK-BE-NEXT: vmov.32 q2[1], r3 +; CHECK-BE-NEXT: vrev64.32 q1, q2 +; CHECK-BE-NEXT: .LBB8_2: @ %else +; CHECK-BE-NEXT: vrev64.32 q2, q0 +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: bpl .LBB8_4 +; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1 +; CHECK-BE-NEXT: ldr r1, [r2, #4] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: vmov.32 q0[3], r1 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: .LBB8_4: @ %else2 +; CHECK-BE-NEXT: vrev64.32 q3, q2 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: vmov r3, s15 +; CHECK-BE-NEXT: adr.w r12, .LCPI8_0 +; CHECK-BE-NEXT: vldrb.u8 q0, [r12] +; CHECK-BE-NEXT: mov.w r12, #0 +; CHECK-BE-NEXT: vrev64.8 q2, q0 +; CHECK-BE-NEXT: vand q0, q1, q2 +; CHECK-BE-NEXT: rsbs r1, r3, #0 +; CHECK-BE-NEXT: sbcs.w r1, r2, r3, asr #31 +; CHECK-BE-NEXT: vmov r3, s13 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt.w r12, #1 +; CHECK-BE-NEXT: rsbs r1, r3, #0 +; CHECK-BE-NEXT: sbcs.w r1, r2, r3, asr #31 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r2, #1 +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r2, #1 +; CHECK-BE-NEXT: bfi r2, r12, #0, #1 +; 
CHECK-BE-NEXT: and r1, r2, #3 +; CHECK-BE-NEXT: lsls r2, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne r2, r3, d0 +; CHECK-BE-NEXT: strdne r3, r2, [r0] +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, r2, d1 +; CHECK-BE-NEXT: strdmi r2, r1, [r0, #8] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: pop {r7, pc} +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: @ %bb.5: +; CHECK-BE-NEXT: .LCPI8_0: +; CHECK-BE-NEXT: .long 0 @ 0x0 +; CHECK-BE-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-BE-NEXT: .long 0 @ 0x0 +; CHECK-BE-NEXT: .long 4294967295 @ 0xffffffff +entry: + %0 = load <2 x i32>, <2 x i32>* %mask, align 4 + %1 = icmp sgt <2 x i32> %0, zeroinitializer + %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef) + %3 = zext <2 x i32> %2 to <2 x i64> + call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1) + ret void +} + define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src) { ; CHECK-LABEL: foo_v8i16_v8i16: ; CHECK: @ %bb.0: @ %entry @@ -255,77 +735,11 @@ define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) { ; CHECK-LABEL: foo_sext_v8i16_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r3, lr, #1 -; CHECK-NEXT: ubfx r1, lr, #2, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r12, #0, #1 -; CHECK-NEXT: bfi r3, r1, #1, #1 -; CHECK-NEXT: ubfx r1, lr, #4, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #2, #1 -; CHECK-NEXT: ubfx r1, lr, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #3, #1 -; CHECK-NEXT: ubfx r1, lr, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; 
CHECK-NEXT: bfi r3, r1, #4, #1 -; CHECK-NEXT: ubfx r1, lr, #10, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #5, #1 -; CHECK-NEXT: ubfx r1, lr, #12, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #6, #1 -; CHECK-NEXT: ubfx r1, lr, #14, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r3 -; CHECK-NEXT: lsls r3, r3, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.16 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.16 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.16 q0[2], r3 -; CHECK-NEXT: lsls r3, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #3] -; CHECK-NEXT: vmovmi.16 q0[3], r3 -; CHECK-NEXT: lsls r3, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.16 q0[4], r3 -; CHECK-NEXT: lsls r3, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #5] -; CHECK-NEXT: vmovmi.16 q0[5], r3 -; CHECK-NEXT: lsls r3, r1, #25 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #6] -; CHECK-NEXT: vmovmi.16 q0[6], r3 -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #7] -; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vpst +; CHECK-NEXT: vptt.s16 gt, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r2] ; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -338,77 +752,11 @@ define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) { ; CHECK-LABEL: foo_zext_v8i16_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1] -; 
CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: and r3, lr, #1 -; CHECK-NEXT: ubfx r1, lr, #2, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r12, #0, #1 -; CHECK-NEXT: bfi r3, r1, #1, #1 -; CHECK-NEXT: ubfx r1, lr, #4, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #2, #1 -; CHECK-NEXT: ubfx r1, lr, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #3, #1 -; CHECK-NEXT: ubfx r1, lr, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #4, #1 -; CHECK-NEXT: ubfx r1, lr, #10, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #5, #1 -; CHECK-NEXT: ubfx r1, lr, #12, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #6, #1 -; CHECK-NEXT: ubfx r1, lr, #14, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r3 -; CHECK-NEXT: lsls r3, r3, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrbne r3, [r2] -; CHECK-NEXT: vmovne.16 q0[0], r3 -; CHECK-NEXT: lsls r3, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #1] -; CHECK-NEXT: vmovmi.16 q0[1], r3 -; CHECK-NEXT: lsls r3, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #2] -; CHECK-NEXT: vmovmi.16 q0[2], r3 -; CHECK-NEXT: lsls r3, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #3] -; CHECK-NEXT: vmovmi.16 q0[3], r3 -; CHECK-NEXT: lsls r3, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #4] -; CHECK-NEXT: vmovmi.16 q0[4], r3 -; CHECK-NEXT: lsls r3, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #5] -; CHECK-NEXT: vmovmi.16 q0[5], r3 -; CHECK-NEXT: lsls r3, r1, #25 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r3, [r2, #6] -; CHECK-NEXT: vmovmi.16 q0[6], r3 -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrbmi r1, [r2, #7] -; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vpst +; 
CHECK-NEXT: vptt.s16 gt, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r2] ; CHECK-NEXT: vstrht.16 q0, [r0] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -435,74 +783,23 @@ } define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) { -; CHECK-LABEL: foo_trunc_v8i8_v8i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: vpt.s16 gt, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r2] -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: and r2, r1, #1 -; CHECK-NEXT: rsbs r3, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r3, #0, #1 -; CHECK-NEXT: ubfx r3, r1, #2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #1, #1 -; CHECK-NEXT: ubfx r3, r1, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #2, #1 -; CHECK-NEXT: ubfx r3, r1, #6, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: ubfx r3, r1, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #4, #1 -; CHECK-NEXT: ubfx r3, r1, #10, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #5, #1 -; CHECK-NEXT: ubfx r3, r1, #12, #1 -; CHECK-NEXT: ubfx r1, r1, #14, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r2 -; CHECK-NEXT: lsls r2, r2, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne.u16 r2, q0[0] -; CHECK-NEXT: strbne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[1] -; CHECK-NEXT: strbmi r2, [r0, #1] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[2] -; CHECK-NEXT: strbmi r2, [r0, #2] -; CHECK-NEXT: lsls r2, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[3] -; CHECK-NEXT: strbmi r2, [r0, #3] -; CHECK-NEXT: lsls 
r2, r1, #27 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[4] -; CHECK-NEXT: strbmi r2, [r0, #4] -; CHECK-NEXT: lsls r2, r1, #26 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[5] -; CHECK-NEXT: strbmi r2, [r0, #5] -; CHECK-NEXT: lsls r2, r1, #25 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r2, q0[6] -; CHECK-NEXT: strbmi r2, [r0, #6] -; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi.u16 r1, q0[7] -; CHECK-NEXT: strbmi r1, [r0, #7] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v8i8_v8i16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrh.u16 q0, [r1] +; CHECK-LE-NEXT: vptt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrht.u16 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v8i8_v8i16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: vpt.s16 gt, q0, zr +; CHECK-BE-NEXT: vldrht.u16 q0, [r2] +; CHECK-BE-NEXT: vrev16.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer @@ -513,45 +810,23 @@ } define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { -; CHECK-LABEL: foo_trunc_v4i8_v4i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vpt.s32 gt, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r2] -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r1, r2, #1 -; CHECK-NEXT: rsbs r3, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r1, r3, #0, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r1, r2, #3, #1 -; CHECK-NEXT: lsls r2, r1, #31 -; 
CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r2, s0 -; CHECK-NEXT: strbne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s1 -; CHECK-NEXT: strbmi r2, [r0, #1] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s2 -; CHECK-NEXT: strbmi r2, [r0, #2] -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r1, s3 -; CHECK-NEXT: strbmi r1, [r0, #3] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v4i8_v4i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: vptt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v4i8_v4i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -562,45 +837,23 @@ } define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { -; CHECK-LABEL: foo_trunc_v4i16_v4i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vpt.s32 gt, q0, zr -; CHECK-NEXT: vldrwt.u32 q0, [r2] -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r1, r2, #1 -; CHECK-NEXT: rsbs r3, r1, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r1, r3, #0, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #1, #1 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r1, r3, #2, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r1, r2, #3, #1 -; CHECK-NEXT: lsls r2, r1, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r2, s0 -; 
CHECK-NEXT: strhne r2, [r0] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s1 -; CHECK-NEXT: strhmi r2, [r0, #2] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r2, s2 -; CHECK-NEXT: strhmi r2, [r0, #4] -; CHECK-NEXT: lsls r1, r1, #28 -; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r1, s3 -; CHECK-NEXT: strhmi r1, [r0, #6] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: foo_trunc_v4i16_v4i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: vptt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: foo_trunc_v4i16_v4i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrwt.u32 q0, [r2] +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vpst +; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: bx lr entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer @@ -642,6 +895,270 @@ ret void } +define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%src) { +; CHECK-LABEL: foo_v4f32_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: mov.w lr, #0 +; CHECK-NEXT: @ implicit-def: $q1 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: vmrs r3, p0 +; CHECK-NEXT: and r1, r3, #1 +; CHECK-NEXT: rsb.w r12, r1, #0 +; CHECK-NEXT: ubfx r1, r3, #4, #1 +; CHECK-NEXT: bfi lr, r12, #0, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi lr, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r3, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi lr, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r3, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi lr, r1, #3, #1 +; CHECK-NEXT: lsls.w r1, lr, #31 +; CHECK-NEXT: beq .LBB18_2 +; 
CHECK-NEXT: @ %bb.1: @ %cond.load +; CHECK-NEXT: vldr.16 s4, [r2] +; CHECK-NEXT: .LBB18_2: @ %else +; CHECK-NEXT: lsls.w r1, lr, #30 +; CHECK-NEXT: bpl .LBB18_6 +; CHECK-NEXT: @ %bb.3: @ %cond.load1 +; CHECK-NEXT: vldr.16 s0, [r2, #2] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: lsls.w r1, lr, #29 +; CHECK-NEXT: bmi .LBB18_7 +; CHECK-NEXT: .LBB18_4: +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: lsls.w r1, lr, #28 +; CHECK-NEXT: bmi .LBB18_8 +; CHECK-NEXT: .LBB18_5: +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: b .LBB18_9 +; CHECK-NEXT: .LBB18_6: +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: lsls.w r1, lr, #29 +; CHECK-NEXT: bpl .LBB18_4 +; CHECK-NEXT: .LBB18_7: @ %cond.load4 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vldr.16 s4, [r2, #4] +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vmov.16 q2[1], r3 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: lsls.w r1, lr, #28 +; CHECK-NEXT: bpl .LBB18_5 +; CHECK-NEXT: .LBB18_8: @ %cond.load7 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vldr.16 s0, [r2, #6] +; CHECK-NEXT: vmov.16 q1[1], r1 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov.16 q1[2], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: .LBB18_9: @ %else8 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vcvtb.f32.f16 s3, s0 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vcvtb.f32.f16 s2, s5 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vcvtb.f32.f16 s1, s8 +; CHECK-NEXT: vcvtb.f32.f16 s0, s4 +; CHECK-NEXT: and r3, r2, #1 +; 
CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: ittt ne +; CHECK-NEXT: vstrne s0, [sp, #12] +; CHECK-NEXT: ldrne r2, [sp, #12] +; CHECK-NEXT: strne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: ittt mi +; CHECK-NEXT: vstrmi s1, [sp, #8] +; CHECK-NEXT: ldrmi r2, [sp, #8] +; CHECK-NEXT: strmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: ittt mi +; CHECK-NEXT: vstrmi s2, [sp, #4] +; CHECK-NEXT: ldrmi r2, [sp, #4] +; CHECK-NEXT: strmi r2, [r0, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: ittt mi +; CHECK-NEXT: vstrmi s3, [sp] +; CHECK-NEXT: ldrmi r1, [sp] +; CHECK-NEXT: strmi r1, [r0, #12] +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: pop {r7, pc} +entry: + %0 = load <4 x i16>, <4 x i16>* %mask, align 2 + %1 = icmp sgt <4 x i16> %0, zeroinitializer + %2 = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef) + %3 = fpext <4 x half> %2 to <4 x float> + call void @llvm.masked.store.v4f32(<4 x float> %3, <4 x float>* %dest, i32 2, <4 x i1> %1) + ret void +} + +define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%src) { +; CHECK-LABEL: foo_v4f32_v4f16_unaligned: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: mov.w lr, #0 +; CHECK-NEXT: @ implicit-def: $q1 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: vmrs r3, p0 +; CHECK-NEXT: and r1, r3, #1 +; CHECK-NEXT: rsb.w r12, r1, #0 +; CHECK-NEXT: ubfx r1, r3, #4, #1 +; CHECK-NEXT: bfi lr, r12, #0, #1 +; CHECK-NEXT: rsbs r1, r1, #0 
+; CHECK-NEXT: bfi lr, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r3, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi lr, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r3, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi lr, r1, #3, #1 +; CHECK-NEXT: lsls.w r1, lr, #31 +; CHECK-NEXT: beq .LBB19_2 +; CHECK-NEXT: @ %bb.1: @ %cond.load +; CHECK-NEXT: vldr.16 s4, [r2] +; CHECK-NEXT: .LBB19_2: @ %else +; CHECK-NEXT: lsls.w r1, lr, #30 +; CHECK-NEXT: bpl .LBB19_6 +; CHECK-NEXT: @ %bb.3: @ %cond.load1 +; CHECK-NEXT: vldr.16 s0, [r2, #2] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: lsls.w r1, lr, #29 +; CHECK-NEXT: bmi .LBB19_7 +; CHECK-NEXT: .LBB19_4: +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: lsls.w r1, lr, #28 +; CHECK-NEXT: bmi .LBB19_8 +; CHECK-NEXT: .LBB19_5: +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: b .LBB19_9 +; CHECK-NEXT: .LBB19_6: +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: lsls.w r1, lr, #29 +; CHECK-NEXT: bpl .LBB19_4 +; CHECK-NEXT: .LBB19_7: @ %cond.load4 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vldr.16 s4, [r2, #4] +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vmov.16 q2[1], r3 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: lsls.w r1, lr, #28 +; CHECK-NEXT: bpl .LBB19_5 +; CHECK-NEXT: .LBB19_8: @ %cond.load7 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vldr.16 s0, [r2, #6] +; CHECK-NEXT: vmov.16 q1[1], r1 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov.16 q1[2], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: .LBB19_9: @ %else8 +; 
CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vcvtb.f32.f16 s3, s0 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vcvtb.f32.f16 s2, s5 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vcvtb.f32.f16 s1, s8 +; CHECK-NEXT: vcvtb.f32.f16 s0, s4 +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: ittt ne +; CHECK-NEXT: vstrne s0, [sp, #12] +; CHECK-NEXT: ldrne r2, [sp, #12] +; CHECK-NEXT: strne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: ittt mi +; CHECK-NEXT: vstrmi s1, [sp, #8] +; CHECK-NEXT: ldrmi r2, [sp, #8] +; CHECK-NEXT: strmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: ittt mi +; CHECK-NEXT: vstrmi s2, [sp, #4] +; CHECK-NEXT: ldrmi r2, [sp, #4] +; CHECK-NEXT: strmi r2, [r0, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: ittt mi +; CHECK-NEXT: vstrmi s3, [sp] +; CHECK-NEXT: ldrmi r1, [sp] +; CHECK-NEXT: strmi r1, [r0, #12] +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: pop {r7, pc} +entry: + %0 = load <4 x i16>, <4 x i16>* %mask, align 2 + %1 = icmp sgt <4 x i16> %0, zeroinitializer + %2 = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef) + %3 = fpext <4 x half> %2 to <4 x float> + call void @llvm.masked.store.v4f32(<4 x float> %3, <4 x float>* %dest, i32 1, <4 x i1> %1) + ret void +} + declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) @@ -649,13 +1166,16 @@ declare void @llvm.masked.store.v4f32(<4 x float>, <4 x 
float>*, i32, <4 x i1>) declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <4 x half> @llvm.masked.load.v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>) declare <8 x half> @llvm.masked.load.v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) declare void @llvm.masked.store.v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) declare void @llvm.masked.store.v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) declare <4 x i16> @llvm.masked.load.v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) declare <4 x i8> @llvm.masked.load.v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -45,17 +45,82 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(<4 x i32> *%dest, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i32_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u8 q0, [r0] +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r1, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r1, #0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r2, [r0] +; CHECK-LE-NEXT: vmovne.32 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r0, [r0, #12] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i32_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-BE-NEXT: vrev32.8 q1, q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r1, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r1, #0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrne r2, [r0] +; CHECK-BE-NEXT: vmovne.32 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi 
r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.32 q1[2], r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r0, [r0, #12] +; CHECK-BE-NEXT: vmovmi.32 q1[3], r0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -85,6 +150,320 @@ ret <4 x i32> %l } +define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align2_zero(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: zext16_masked_v4i32_align2_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.u32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext16_masked_v4i32_align2_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrht.u32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer) + %ext = zext <4 x i16> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align2_undef(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: zext16_masked_v4i32_align2_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.u32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext16_masked_v4i32_align2_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrht.u32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> undef) + %ext = zext <4 x i16> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(<4 x i16> *%dest, <4 
x i32> %a) { +; CHECK-LE-LABEL: zext16_masked_v4i32_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r1, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r1, #0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrhne r2, [r0] +; CHECK-LE-NEXT: vmovne.32 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #6] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: vmovlb.u16 q0, q0 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext16_masked_v4i32_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r1, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r1, #0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; 
CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrhne r2, [r0] +; CHECK-BE-NEXT: vmovne.32 q0[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #6] +; CHECK-BE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-BE-NEXT: vmovlb.u16 q1, q0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 1, <4 x i1> %c, <4 x i16> undef) + %ext = zext <4 x i16> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align2_other(<4 x i16> *%dest, <4 x i16> %a) { +; CHECK-LE-LABEL: zext16_masked_v4i32_align2_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.u16 q1, q0 +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.u32 q0, [r0] +; CHECK-LE-NEXT: vpsel q0, q0, q1 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext16_masked_v4i32_align2_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmovlb.u16 q0, q1 +; CHECK-BE-NEXT: vmovlb.s16 q1, q1 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrht.u32 q1, [r0] +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i16> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> %a) + %ext = zext <4 x i16> %l to <4 x i32> + ret <4 x i32> %ext +} + +define 
arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align2_zero(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: sext16_masked_v4i32_align2_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.s32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext16_masked_v4i32_align2_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrht.s32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer) + %sext = sext <4 x i16> %l to <4 x i32> + ret <4 x i32> %sext +} + +define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align2_undef(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: sext16_masked_v4i32_align2_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.s32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext16_masked_v4i32_align2_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrht.s32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> undef) + %sext = sext <4 x i16> %l to <4 x i32> + ret <4 x i32> %sext +} + +define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: sext16_masked_v4i32_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r1, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r1, #0 +; CHECK-LE-NEXT: movs r1, #0 +; 
CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrhne r2, [r0] +; CHECK-LE-NEXT: vmovne.32 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #6] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext16_masked_v4i32_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r1, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r1, #0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrhne r2, [r0] +; CHECK-BE-NEXT: vmovne.32 q0[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.32 
q0[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #6] +; CHECK-BE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-BE-NEXT: vmovlb.s16 q1, q0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 1, <4 x i1> %c, <4 x i16> undef) + %sext = sext <4 x i16> %l to <4 x i32> + ret <4 x i32> %sext +} + +define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align2_other(<4 x i16> *%dest, <4 x i16> %a) { +; CHECK-LE-LABEL: sext16_masked_v4i32_align2_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.s32 q1, [r0] +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext16_masked_v4i32_align2_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmovlb.s16 q0, q1 +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrht.s32 q1, [r0] +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i16> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> %a) + %sext = sext <4 x i16> %l to <4 x i32> + ret <4 x i32> %sext +} + define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i32_preinc: ; CHECK-LE: @ %bb.0: @ %entry @@ -139,25 +518,18 @@ ret i8* %z } - - define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_zero(<8 x i16> *%dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_align4_zero: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: 
vldrht.u16 q0, [r0] -; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_align4_zero: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov.i32 q1, #0x0 -; CHECK-BE-NEXT: vrev64.16 q2, q0 -; CHECK-BE-NEXT: vrev32.16 q1, q1 -; CHECK-BE-NEXT: vpt.s16 gt, q2, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0] -; CHECK-BE-NEXT: vpsel q1, q0, q1 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vpt.s16 gt, q1, zr +; CHECK-BE-NEXT: vldrht.u16 q1, [r0] ; CHECK-BE-NEXT: vrev64.16 q0, q1 ; CHECK-BE-NEXT: bx lr entry: @@ -166,14 +538,14 @@ ret <8 x i16> %l } -define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_undef(<8 x i16> *%dest, <8 x i16> %a) { -; CHECK-LE-LABEL: masked_v8i16_align4_undef: +define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align2_undef(<8 x i16> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_align2_undef: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vldrht.u16 q0, [r0] ; CHECK-LE-NEXT: bx lr ; -; CHECK-BE-LABEL: masked_v8i16_align4_undef: +; CHECK-BE-LABEL: masked_v8i16_align2_undef: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr @@ -189,17 +561,140 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u8 q0, [r0] +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r2, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, 
#6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrhne r2, [r0] +; CHECK-LE-NEXT: vmovne.16 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-LE-NEXT: vmovmi.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #8 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-BE-NEXT: vrev16.8 q1, q0 +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr 
+; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r2, #0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrhne r2, [r0] +; CHECK-BE-NEXT: vmovne.16 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.16 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; 
CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-BE-NEXT: vmovmi.16 q1[7], r0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <8 x i16> %a, zeroinitializer @@ -229,6 +724,308 @@ ret <8 x i16> %l } +define arm_aapcs_vfpcc <8 x i16> @sext8_masked_v8i16_align1_zero(<8 x i8> *%dest, <8 x i8> %a) { +; CHECK-LE-LABEL: sext8_masked_v8i16_align1_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vpt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.s16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext8_masked_v8i16_align1_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vpt.s16 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.s16 q1, [r0] +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i8> %a, zeroinitializer + %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> zeroinitializer) + %ext = sext <8 x i8> %l to <8 x i16> + ret <8 x i16> %ext +} + +define arm_aapcs_vfpcc <8 x i16> @sext8_masked_v8i16_align1_undef(<8 x i8> *%dest, <8 x i8> %a) { +; CHECK-LE-LABEL: sext8_masked_v8i16_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vpt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.s16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext8_masked_v8i16_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vpt.s16 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.s16 q1, [r0] +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i8> %a, zeroinitializer + %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> undef) + %ext = sext <8 x i8> %l to <8 x i16> + ret <8 x i16> %ext +} + +define arm_aapcs_vfpcc <8 x i16> 
@sext8_masked_v8i16_align1_other(<8 x i8> *%dest, <8 x i8> %a) { +; CHECK-LE-LABEL: sext8_masked_v8i16_align1_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vpt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.s16 q1, [r0] +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext8_masked_v8i16_align1_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vpt.s16 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.s16 q1, [r0] +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i8> %a, zeroinitializer + %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> %a) + %ext = sext <8 x i8> %l to <8 x i16> + ret <8 x i16> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @sext8_masked_v4i32_align1_zero(<4 x i8> *%dest, <4 x i8> %a) { +; CHECK-LE-LABEL: sext8_masked_v4i32_align1_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.s32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext8_masked_v4i32_align1_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vmovlb.s16 q0, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.s32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i8> %a, zeroinitializer + %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> zeroinitializer) + %ext = sext <4 x i8> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @sext8_masked_v4i32_align1_undef(<4 x i8> *%dest, <4 x i8> %a) { +; CHECK-LE-LABEL: sext8_masked_v4i32_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; 
CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.s32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext8_masked_v4i32_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vmovlb.s16 q0, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.s32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i8> %a, zeroinitializer + %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> undef) + %ext = sext <4 x i8> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @sext8_masked_v4i32_align1_other(<4 x i8> *%dest, <4 x i8> %a) { +; CHECK-LE-LABEL: sext8_masked_v4i32_align1_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.s32 q1, [r0] +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: sext8_masked_v4i32_align1_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vmovlb.s16 q0, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.s32 q1, [r0] +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i8> %a, zeroinitializer + %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> %a) + %ext = sext <4 x i8> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @zext8_masked_v4i32_align1_zero(<4 x i8> *%dest, <4 x i8> %a) { +; CHECK-LE-LABEL: zext8_masked_v4i32_align1_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.u32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; 
+; CHECK-BE-LABEL: zext8_masked_v4i32_align1_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vmovlb.s16 q0, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.u32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i8> %a, zeroinitializer + %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> zeroinitializer) + %ext = zext <4 x i8> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @zext8_masked_v4i32_align1_undef(<4 x i8> *%dest, <4 x i8> %a) { +; CHECK-LE-LABEL: zext8_masked_v4i32_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.u32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext8_masked_v4i32_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vmovlb.s16 q0, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.u32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i8> %a, zeroinitializer + %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> undef) + %ext = zext <4 x i8> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @zext8_masked_v4i32_align1_other(<4 x i8> *%dest, <4 x i8> %a) { +; CHECK-LE-LABEL: zext8_masked_v4i32_align1_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmov.i32 q1, #0xff +; CHECK-LE-NEXT: vand q1, q0, q1 +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vmovlb.s16 q0, q0 +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.u32 q0, [r0] +; CHECK-LE-NEXT: vpsel q0, q0, q1 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext8_masked_v4i32_align1_other: +; CHECK-BE: @ %bb.0: @ %entry 
+; CHECK-BE-NEXT: vmov.i32 q1, #0xff +; CHECK-BE-NEXT: vrev64.32 q2, q0 +; CHECK-BE-NEXT: vand q0, q2, q1 +; CHECK-BE-NEXT: vmovlb.s8 q1, q2 +; CHECK-BE-NEXT: vmovlb.s16 q1, q1 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrbt.u32 q1, [r0] +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i8> %a, zeroinitializer + %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> %a) + %ext = zext <4 x i8> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <8 x i16> @zext8_masked_v8i16_align1_zero(<8 x i8> *%dest, <8 x i8> %a) { +; CHECK-LE-LABEL: zext8_masked_v8i16_align1_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vpt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.u16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext8_masked_v8i16_align1_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vpt.s16 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.u16 q1, [r0] +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i8> %a, zeroinitializer + %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> zeroinitializer) + %ext = zext <8 x i8> %l to <8 x i16> + ret <8 x i16> %ext +} + +define arm_aapcs_vfpcc <8 x i16> @zext8_masked_v8i16_align1_undef(<8 x i8> *%dest, <8 x i8> %a) { +; CHECK-LE-LABEL: zext8_masked_v8i16_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vpt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.u16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext8_masked_v8i16_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vpt.s16 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.u16 q1, [r0] +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: 
bx lr +entry: + %c = icmp sgt <8 x i8> %a, zeroinitializer + %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> undef) + %ext = zext <8 x i8> %l to <8 x i16> + ret <8 x i16> %ext +} + +define arm_aapcs_vfpcc <8 x i16> @zext8_masked_v8i16_align1_other(<8 x i8> *%dest, <8 x i8> %a) { +; CHECK-LE-LABEL: zext8_masked_v8i16_align1_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vmovlb.u8 q1, q0 +; CHECK-LE-NEXT: vmovlb.s8 q0, q0 +; CHECK-LE-NEXT: vpt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.u16 q0, [r0] +; CHECK-LE-NEXT: vpsel q0, q0, q1 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: zext8_masked_v8i16_align1_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vmovlb.u8 q0, q1 +; CHECK-BE-NEXT: vmovlb.s8 q1, q1 +; CHECK-BE-NEXT: vpt.s16 gt, q1, zr +; CHECK-BE-NEXT: vldrbt.u16 q1, [r0] +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i8> %a, zeroinitializer + %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> %a) + %ext = zext <8 x i8> %l to <8 x i16> + ret <8 x i16> %ext +} + define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_preinc: ; CHECK-LE: @ %bb.0: @ %entry @@ -291,20 +1088,15 @@ define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_zero(<16 x i8> *%dest, <16 x i8> %a) { ; CHECK-LE-LABEL: masked_v16i8_align4_zero: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr ; CHECK-LE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v16i8_align4_zero: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov.i32 q1, #0x0 -; CHECK-BE-NEXT: vrev64.8 q2, q0 -; CHECK-BE-NEXT: vrev32.8 q1, q1 -; CHECK-BE-NEXT: vpt.s8 gt, q2, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-BE-NEXT: vpsel q1, q0, q1 +; CHECK-BE-NEXT: vrev64.8 q1, 
q0 +; CHECK-BE-NEXT: vpt.s8 gt, q1, zr +; CHECK-BE-NEXT: vldrbt.u8 q1, [r0] ; CHECK-BE-NEXT: vrev64.8 q0, q1 ; CHECK-BE-NEXT: bx lr entry: @@ -413,19 +1205,15 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_zero(<4 x float> *%dest, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4f32_align4_zero: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4f32_align4_zero: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov.i32 q1, #0x0 -; CHECK-BE-NEXT: vrev64.32 q2, q0 -; CHECK-BE-NEXT: vpt.s32 gt, q2, zr -; CHECK-BE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-BE-NEXT: vpsel q1, q0, q1 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrwt.u32 q1, [r0] ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr entry: @@ -457,17 +1245,82 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(<4 x float> *%dest, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4f32_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u8 q0, [r0] +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r1, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r1, #0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r2, [r0] +; CHECK-LE-NEXT: vmovne s0, r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; 
CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi s1, r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi s2, r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r0, [r0, #12] +; CHECK-LE-NEXT: vmovmi s3, r0 +; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4f32_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-BE-NEXT: vrev32.8 q1, q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r1, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r1, #0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrne r2, [r0] +; CHECK-BE-NEXT: vmovne s4, r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi s5, r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi s6, r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r0, [r0, #12] +; CHECK-BE-NEXT: vmovmi s7, r0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -556,20 +1409,15 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_zero(<8 x half> *%dest, <8 
x i16> %a) { ; CHECK-LE-LABEL: masked_v8f16_align4_zero: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vldrht.u16 q0, [r0] -; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8f16_align4_zero: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov.i32 q1, #0x0 -; CHECK-BE-NEXT: vrev64.16 q2, q0 -; CHECK-BE-NEXT: vrev32.16 q1, q1 -; CHECK-BE-NEXT: vpt.s16 gt, q2, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0] -; CHECK-BE-NEXT: vpsel q1, q0, q1 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vpt.s16 gt, q1, zr +; CHECK-BE-NEXT: vldrht.u16 q1, [r0] ; CHECK-BE-NEXT: vrev64.16 q0, q1 ; CHECK-BE-NEXT: bx lr entry: @@ -601,18 +1449,248 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8f16_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u8 q0, [r0] +; CHECK-LE-NEXT: .pad #40 +; CHECK-LE-NEXT: sub sp, #40 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r2, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: 
rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: bne .LBB45_9 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB45_10 +; CHECK-LE-NEXT: .LBB45_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB45_11 +; CHECK-LE-NEXT: .LBB45_3: @ %else5 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB45_12 +; CHECK-LE-NEXT: .LBB45_4: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB45_13 +; CHECK-LE-NEXT: .LBB45_5: @ %else11 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB45_14 +; CHECK-LE-NEXT: .LBB45_6: @ %else14 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB45_15 +; CHECK-LE-NEXT: .LBB45_7: @ %else17 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB45_16 +; CHECK-LE-NEXT: .LBB45_8: @ %else20 +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB45_9: @ %cond.load +; CHECK-LE-NEXT: ldrh r2, [r0] +; CHECK-LE-NEXT: strh.w r2, [sp, #28] +; CHECK-LE-NEXT: vldr.16 s0, [sp, #28] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB45_2 +; CHECK-LE-NEXT: .LBB45_10: @ %cond.load1 +; CHECK-LE-NEXT: ldrh r2, [r0, #2] +; CHECK-LE-NEXT: strh.w r2, [sp, #24] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #24] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB45_3 +; CHECK-LE-NEXT: .LBB45_11: @ %cond.load4 +; CHECK-LE-NEXT: ldrh r2, [r0, #4] +; CHECK-LE-NEXT: strh.w r2, [sp, #20] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #20] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB45_4 +; CHECK-LE-NEXT: .LBB45_12: @ %cond.load7 +; CHECK-LE-NEXT: ldrh r2, [r0, #6] +; CHECK-LE-NEXT: strh.w r2, [sp, #16] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #16] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[3], r2 +; 
CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB45_5 +; CHECK-LE-NEXT: .LBB45_13: @ %cond.load10 +; CHECK-LE-NEXT: ldrh r2, [r0, #8] +; CHECK-LE-NEXT: strh.w r2, [sp, #12] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #12] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB45_6 +; CHECK-LE-NEXT: .LBB45_14: @ %cond.load13 +; CHECK-LE-NEXT: ldrh r2, [r0, #10] +; CHECK-LE-NEXT: strh.w r2, [sp, #8] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #8] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB45_7 +; CHECK-LE-NEXT: .LBB45_15: @ %cond.load16 +; CHECK-LE-NEXT: ldrh r2, [r0, #12] +; CHECK-LE-NEXT: strh.w r2, [sp, #4] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #4] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB45_8 +; CHECK-LE-NEXT: .LBB45_16: @ %cond.load19 +; CHECK-LE-NEXT: ldrh r0, [r0, #14] +; CHECK-LE-NEXT: strh.w r0, [sp] +; CHECK-LE-NEXT: vldr.16 s4, [sp] +; CHECK-LE-NEXT: vmov r0, s4 +; CHECK-LE-NEXT: vmov.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #40 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8f16_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #40 +; CHECK-BE-NEXT: sub sp, #40 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-BE-NEXT: vrev16.8 q1, q0 +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r2, #0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs 
r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 +; CHECK-BE-NEXT: bne .LBB45_10 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB45_11 +; CHECK-BE-NEXT: .LBB45_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB45_12 +; CHECK-BE-NEXT: .LBB45_3: @ %else5 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB45_13 +; CHECK-BE-NEXT: .LBB45_4: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB45_14 +; CHECK-BE-NEXT: .LBB45_5: @ %else11 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB45_15 +; CHECK-BE-NEXT: .LBB45_6: @ %else14 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB45_16 +; CHECK-BE-NEXT: .LBB45_7: @ %else17 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB45_9 +; CHECK-BE-NEXT: .LBB45_8: @ %cond.load19 +; CHECK-BE-NEXT: ldrh r0, [r0, #14] +; CHECK-BE-NEXT: strh.w r0, [sp] +; CHECK-BE-NEXT: vldr.16 s0, [sp] +; CHECK-BE-NEXT: vmov r0, s0 +; CHECK-BE-NEXT: vmov.16 q1[7], r0 +; CHECK-BE-NEXT: .LBB45_9: @ %else20 ; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #40 ; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB45_10: @ %cond.load +; CHECK-BE-NEXT: ldrh r2, [r0] +; CHECK-BE-NEXT: strh.w r2, [sp, #28] +; CHECK-BE-NEXT: vldr.16 s4, [sp, #28] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB45_2 +; CHECK-BE-NEXT: .LBB45_11: @ %cond.load1 +; CHECK-BE-NEXT: ldrh r2, [r0, #2] +; CHECK-BE-NEXT: strh.w r2, [sp, #24] +; 
CHECK-BE-NEXT: vldr.16 s0, [sp, #24] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB45_3 +; CHECK-BE-NEXT: .LBB45_12: @ %cond.load4 +; CHECK-BE-NEXT: ldrh r2, [r0, #4] +; CHECK-BE-NEXT: strh.w r2, [sp, #20] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #20] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB45_4 +; CHECK-BE-NEXT: .LBB45_13: @ %cond.load7 +; CHECK-BE-NEXT: ldrh r2, [r0, #6] +; CHECK-BE-NEXT: strh.w r2, [sp, #16] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #16] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB45_5 +; CHECK-BE-NEXT: .LBB45_14: @ %cond.load10 +; CHECK-BE-NEXT: ldrh r2, [r0, #8] +; CHECK-BE-NEXT: strh.w r2, [sp, #12] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #12] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB45_6 +; CHECK-BE-NEXT: .LBB45_15: @ %cond.load13 +; CHECK-BE-NEXT: ldrh r2, [r0, #10] +; CHECK-BE-NEXT: strh.w r2, [sp, #8] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #8] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB45_7 +; CHECK-BE-NEXT: .LBB45_16: @ %cond.load16 +; CHECK-BE-NEXT: ldrh r2, [r0, #12] +; CHECK-BE-NEXT: strh.w r2, [sp, #4] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #4] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB45_8 +; CHECK-BE-NEXT: b .LBB45_9 entry: %c = icmp sgt <8 x i16> %a, zeroinitializer %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 1, <8 x i1> %c, <8 x half> undef) @@ -722,14 +1800,14 @@ ; CHECK-LE-NEXT: bfi r2, r1, #0, #1 ; CHECK-LE-NEXT: and r1, r2, #3 ; CHECK-LE-NEXT: lsls r2, r2, #31 -; CHECK-LE-NEXT: beq .LBB29_2 +; CHECK-LE-NEXT: beq .LBB49_2 ; 
CHECK-LE-NEXT: @ %bb.1: @ %cond.load -; CHECK-LE-NEXT: vldr d1, .LCPI29_0 +; CHECK-LE-NEXT: vldr d1, .LCPI49_0 ; CHECK-LE-NEXT: vldr d0, [r0] -; CHECK-LE-NEXT: b .LBB29_3 -; CHECK-LE-NEXT: .LBB29_2: +; CHECK-LE-NEXT: b .LBB49_3 +; CHECK-LE-NEXT: .LBB49_2: ; CHECK-LE-NEXT: vmov.i32 q0, #0x0 -; CHECK-LE-NEXT: .LBB29_3: @ %else +; CHECK-LE-NEXT: .LBB49_3: @ %else ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vldrmi d1, [r0, #8] @@ -737,7 +1815,7 @@ ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.4: -; CHECK-LE-NEXT: .LCPI29_0: +; CHECK-LE-NEXT: .LCPI49_0: ; CHECK-LE-NEXT: .long 0 @ double 0 ; CHECK-LE-NEXT: .long 0 ; @@ -766,15 +1844,15 @@ ; CHECK-BE-NEXT: bfi r2, r1, #0, #1 ; CHECK-BE-NEXT: and r1, r2, #3 ; CHECK-BE-NEXT: lsls r2, r2, #31 -; CHECK-BE-NEXT: beq .LBB29_2 +; CHECK-BE-NEXT: beq .LBB49_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load -; CHECK-BE-NEXT: vldr d1, .LCPI29_0 +; CHECK-BE-NEXT: vldr d1, .LCPI49_0 ; CHECK-BE-NEXT: vldr d0, [r0] -; CHECK-BE-NEXT: b .LBB29_3 -; CHECK-BE-NEXT: .LBB29_2: +; CHECK-BE-NEXT: b .LBB49_3 +; CHECK-BE-NEXT: .LBB49_2: ; CHECK-BE-NEXT: vmov.i32 q1, #0x0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: .LBB29_3: @ %else +; CHECK-BE-NEXT: .LBB49_3: @ %else ; CHECK-BE-NEXT: lsls r1, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vldrmi d1, [r0, #8] @@ -782,7 +1860,7 @@ ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .p2align 3 ; CHECK-BE-NEXT: @ %bb.4: -; CHECK-BE-NEXT: .LCPI29_0: +; CHECK-BE-NEXT: .LCPI49_0: ; CHECK-BE-NEXT: .long 0 @ double 0 ; CHECK-BE-NEXT: .long 0 entry: @@ -816,14 +1894,14 @@ ; CHECK-LE-NEXT: bfi r2, r1, #0, #1 ; CHECK-LE-NEXT: and r1, r2, #3 ; CHECK-LE-NEXT: lsls r2, r2, #31 -; CHECK-LE-NEXT: beq .LBB30_2 +; CHECK-LE-NEXT: beq .LBB50_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load -; CHECK-LE-NEXT: vldr d1, .LCPI30_0 +; CHECK-LE-NEXT: vldr d1, .LCPI50_0 ; CHECK-LE-NEXT: vldr d0, [r0] -; CHECK-LE-NEXT: b .LBB30_3 -; CHECK-LE-NEXT: .LBB30_2: +; CHECK-LE-NEXT: b .LBB50_3 +; 
CHECK-LE-NEXT: .LBB50_2: ; CHECK-LE-NEXT: vmov.i32 q0, #0x0 -; CHECK-LE-NEXT: .LBB30_3: @ %else +; CHECK-LE-NEXT: .LBB50_3: @ %else ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vldrmi d1, [r0, #8] @@ -831,7 +1909,7 @@ ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.4: -; CHECK-LE-NEXT: .LCPI30_0: +; CHECK-LE-NEXT: .LCPI50_0: ; CHECK-LE-NEXT: .long 0 @ double 0 ; CHECK-LE-NEXT: .long 0 ; @@ -860,15 +1938,15 @@ ; CHECK-BE-NEXT: bfi r2, r1, #0, #1 ; CHECK-BE-NEXT: and r1, r2, #3 ; CHECK-BE-NEXT: lsls r2, r2, #31 -; CHECK-BE-NEXT: beq .LBB30_2 +; CHECK-BE-NEXT: beq .LBB50_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load -; CHECK-BE-NEXT: vldr d1, .LCPI30_0 +; CHECK-BE-NEXT: vldr d1, .LCPI50_0 ; CHECK-BE-NEXT: vldr d0, [r0] -; CHECK-BE-NEXT: b .LBB30_3 -; CHECK-BE-NEXT: .LBB30_2: +; CHECK-BE-NEXT: b .LBB50_3 +; CHECK-BE-NEXT: .LBB50_2: ; CHECK-BE-NEXT: vmov.i32 q1, #0x0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: .LBB30_3: @ %else +; CHECK-BE-NEXT: .LBB50_3: @ %else ; CHECK-BE-NEXT: lsls r1, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vldrmi d1, [r0, #8] @@ -876,7 +1954,7 @@ ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .p2align 3 ; CHECK-BE-NEXT: @ %bb.4: -; CHECK-BE-NEXT: .LCPI30_0: +; CHECK-BE-NEXT: .LCPI50_0: ; CHECK-BE-NEXT: .long 0 @ double 0 ; CHECK-BE-NEXT: .long 0 entry: @@ -885,10 +1963,254 @@ ret <2 x double> %l } +define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: anyext_v4i16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.u32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: anyext_v4i16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrht.u32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* 
%dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer) + ret <4 x i16> %l +} + +define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: anyext_v4i16_align1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov.w r12, #0 +; CHECK-LE-NEXT: vmrs r3, p0 +; CHECK-LE-NEXT: and r1, r3, #1 +; CHECK-LE-NEXT: rsbs r2, r1, #0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r3, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r3, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r3, #12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: beq .LBB52_2 +; CHECK-LE-NEXT: @ %bb.1: @ %cond.load +; CHECK-LE-NEXT: ldrh r2, [r0] +; CHECK-LE-NEXT: vdup.32 q0, r12 +; CHECK-LE-NEXT: vmov.32 q0[0], r2 +; CHECK-LE-NEXT: b .LBB52_3 +; CHECK-LE-NEXT: .LBB52_2: +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: .LBB52_3: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #6] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: anyext_v4i16_align1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov.w r12, #0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vmrs r3, p0 +; CHECK-BE-NEXT: and r1, r3, #1 +; CHECK-BE-NEXT: rsbs r2, r1, #0 +; 
CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r3, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r3, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r3, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: beq .LBB52_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: ldrh r2, [r0] +; CHECK-BE-NEXT: vdup.32 q1, r12 +; CHECK-BE-NEXT: vmov.32 q1[0], r2 +; CHECK-BE-NEXT: b .LBB52_3 +; CHECK-BE-NEXT: .LBB52_2: +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: .LBB52_3: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.32 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q1[2], r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #6] +; CHECK-BE-NEXT: vmovmi.32 q1[3], r0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 1, <4 x i1> %c, <4 x i16> zeroinitializer) + ret <4 x i16> %l +} + +define arm_aapcs_vfpcc <4 x i8> @anyext_v4i8(<4 x i8> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: anyext_v4i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.u32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: anyext_v4i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrbt.u32 q1, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i8> 
@llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> zeroinitializer) + ret <4 x i8> %l +} + +define arm_aapcs_vfpcc <8 x i8> @anyext_v8i8(<8 x i8> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: anyext_v8i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: vpt.s16 gt, q0, zr +; CHECK-LE-NEXT: vldrbt.u16 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: anyext_v8i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vpt.s16 gt, q1, zr +; CHECK-BE-NEXT: vldrbt.u16 q1, [r0] +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> zeroinitializer) + ret <8 x i8> %l +} + +define arm_aapcs_vfpcc <4 x i32> @multi_user_zext(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: multi_user_zext: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .vsave {d8, d9} +; CHECK-LE-NEXT: vpush {d8, d9} +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.u32 q4, [r0] +; CHECK-LE-NEXT: vmov r0, r1, d8 +; CHECK-LE-NEXT: vmov r2, r3, d9 +; CHECK-LE-NEXT: bl foo +; CHECK-LE-NEXT: vmovlb.u16 q0, q4 +; CHECK-LE-NEXT: vpop {d8, d9} +; CHECK-LE-NEXT: pop {r7, pc} +; +; CHECK-BE-LABEL: multi_user_zext: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} +; CHECK-BE-NEXT: .vsave {d8, d9} +; CHECK-BE-NEXT: vpush {d8, d9} +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrht.u32 q4, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q4 +; CHECK-BE-NEXT: vmov r1, r0, d0 +; CHECK-BE-NEXT: vmov r3, r2, d1 +; CHECK-BE-NEXT: bl foo +; CHECK-BE-NEXT: vmovlb.u16 q1, q4 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: vpop {d8, d9} +; CHECK-BE-NEXT: pop {r7, pc} +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> 
@llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer) + call void @foo(<4 x i16> %l) + %ext = zext <4 x i16> %l to <4 x i32> + ret <4 x i32> %ext +} + +define arm_aapcs_vfpcc <4 x i32> @multi_user_sext(<4 x i16> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: multi_user_sext: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .vsave {d8, d9} +; CHECK-LE-NEXT: vpush {d8, d9} +; CHECK-LE-NEXT: vpt.s32 gt, q0, zr +; CHECK-LE-NEXT: vldrht.u32 q4, [r0] +; CHECK-LE-NEXT: vmov r0, r1, d8 +; CHECK-LE-NEXT: vmov r2, r3, d9 +; CHECK-LE-NEXT: bl foo +; CHECK-LE-NEXT: vmovlb.s16 q0, q4 +; CHECK-LE-NEXT: vpop {d8, d9} +; CHECK-LE-NEXT: pop {r7, pc} +; +; CHECK-BE-LABEL: multi_user_sext: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} +; CHECK-BE-NEXT: .vsave {d8, d9} +; CHECK-BE-NEXT: vpush {d8, d9} +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpt.s32 gt, q1, zr +; CHECK-BE-NEXT: vldrht.u32 q4, [r0] +; CHECK-BE-NEXT: vrev64.32 q0, q4 +; CHECK-BE-NEXT: vmov r1, r0, d0 +; CHECK-BE-NEXT: vmov r3, r2, d1 +; CHECK-BE-NEXT: bl foo +; CHECK-BE-NEXT: vmovlb.s16 q1, q4 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: vpop {d8, d9} +; CHECK-BE-NEXT: pop {r7, pc} +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer) + call void @foo(<4 x i16> %l) + %ext = sext <4 x i16> %l to <4 x i32> + ret <4 x i32> %ext +} + +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, 
i32, <8 x i1>, <8 x i8>) declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>) declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) +declare void @foo(<4 x i16>) diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -24,16 +24,79 @@ define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i32_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r1, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r1, #0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne r2, s0 +; CHECK-LE-NEXT: strne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s1 +; CHECK-LE-NEXT: strmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s2 +; CHECK-LE-NEXT: strmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; 
CHECK-LE-NEXT: vmovmi r1, s3 +; CHECK-LE-NEXT: strmi r1, [r0, #12] +; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i32_align1: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vrev32.8 q0, q1 -; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r1, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r1, #0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne r2, s4 +; CHECK-BE-NEXT: strne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s5 +; CHECK-BE-NEXT: strmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s6 +; CHECK-BE-NEXT: strmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, s7 +; CHECK-BE-NEXT: strmi r1, [r0, #12] +; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -126,16 +189,137 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r2, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: movs r2, #0 
+; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] +; CHECK-LE-NEXT: strhne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-LE-NEXT: strhmi r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-LE-NEXT: strhmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-LE-NEXT: strhmi r2, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-LE-NEXT: strhmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-LE-NEXT: strhmi r2, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-LE-NEXT: strhmi r2, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-LE-NEXT: strhmi r1, [r0, #14] +; CHECK-LE-NEXT: add sp, #8 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: 
masked_v8i16_align1: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: vrev16.8 q0, q1 -; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r2, #0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne.u16 r2, q1[0] +; CHECK-BE-NEXT: strhne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[1] +; CHECK-BE-NEXT: strhmi r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[2] +; CHECK-BE-NEXT: strhmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[3] +; CHECK-BE-NEXT: strhmi r2, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[4] +; CHECK-BE-NEXT: strhmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; 
CHECK-BE-NEXT: vmovmi.u16 r2, q1[5] +; CHECK-BE-NEXT: strhmi r2, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[6] +; CHECK-BE-NEXT: strhmi r2, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q1[7] +; CHECK-BE-NEXT: strhmi r1, [r0, #14] +; CHECK-BE-NEXT: add sp, #8 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <8 x i16> %a, zeroinitializer @@ -311,17 +495,88 @@ define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) { ; CHECK-LE-LABEL: masked_v4f32_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vpt.i32 ne, q1, zr -; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: .pad #20 +; CHECK-LE-NEXT: sub sp, #20 +; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: ittt ne +; CHECK-LE-NEXT: vstrne s0, [sp, #12] +; CHECK-LE-NEXT: ldrne r2, [sp, #12] +; CHECK-LE-NEXT: strne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: ittt mi +; CHECK-LE-NEXT: vstrmi s1, [sp, #8] +; CHECK-LE-NEXT: ldrmi r2, [sp, #8] +; CHECK-LE-NEXT: strmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: ittt mi +; CHECK-LE-NEXT: vstrmi s2, [sp, #4] +; CHECK-LE-NEXT: ldrmi r2, [sp, #4] +; CHECK-LE-NEXT: strmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: ittt mi +; CHECK-LE-NEXT: vstrmi s3, [sp] +; CHECK-LE-NEXT: ldrmi r1, [sp] +; CHECK-LE-NEXT: strmi r1, [r0, #12] +; 
CHECK-LE-NEXT: add sp, #20 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4f32_align1: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #20 +; CHECK-BE-NEXT: sub sp, #20 ; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vrev32.8 q0, q1 -; CHECK-BE-NEXT: vpt.i32 ne, q2, zr -; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: ittt ne +; CHECK-BE-NEXT: vstrne s4, [sp, #12] +; CHECK-BE-NEXT: ldrne r2, [sp, #12] +; CHECK-BE-NEXT: strne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: ittt mi +; CHECK-BE-NEXT: vstrmi s5, [sp, #8] +; CHECK-BE-NEXT: ldrmi r2, [sp, #8] +; CHECK-BE-NEXT: strmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: ittt mi +; CHECK-BE-NEXT: vstrmi s6, [sp, #4] +; CHECK-BE-NEXT: ldrmi r2, [sp, #4] +; CHECK-BE-NEXT: strmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: ittt mi +; CHECK-BE-NEXT: vstrmi s7, [sp] +; CHECK-BE-NEXT: ldrmi r1, [sp] +; CHECK-BE-NEXT: strmi r1, [r0, #12] +; CHECK-BE-NEXT: add sp, #20 ; CHECK-BE-NEXT: bx lr entry: %c = icmp ugt <4 x i32> %b, zeroinitializer @@ -415,17 +670,226 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) { ; CHECK-LE-LABEL: masked_v8f16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vpt.i16 ne, q1, zr -; CHECK-LE-NEXT: vstrbt.8 q0, [r0] +; CHECK-LE-NEXT: .pad #40 +; CHECK-LE-NEXT: sub sp, #40 +; 
CHECK-LE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: bne .LBB16_9 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB16_10 +; CHECK-LE-NEXT: .LBB16_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB16_11 +; CHECK-LE-NEXT: .LBB16_3: @ %else4 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB16_12 +; CHECK-LE-NEXT: .LBB16_4: @ %else6 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB16_13 +; CHECK-LE-NEXT: .LBB16_5: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB16_14 +; CHECK-LE-NEXT: .LBB16_6: @ %else10 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB16_15 +; CHECK-LE-NEXT: .LBB16_7: @ %else12 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB16_16 +; CHECK-LE-NEXT: .LBB16_8: @ %else14 +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB16_9: @ %cond.store +; CHECK-LE-NEXT: vstr.16 s0, [sp, #28] +; CHECK-LE-NEXT: ldrh.w r2, [sp, 
#28] +; CHECK-LE-NEXT: strh r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB16_2 +; CHECK-LE-NEXT: .LBB16_10: @ %cond.store1 +; CHECK-LE-NEXT: vmovx.f16 s4, s0 +; CHECK-LE-NEXT: vstr.16 s4, [sp, #24] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #24] +; CHECK-LE-NEXT: strh r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB16_3 +; CHECK-LE-NEXT: .LBB16_11: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s1, [sp, #20] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #20] +; CHECK-LE-NEXT: strh r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB16_4 +; CHECK-LE-NEXT: .LBB16_12: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s4, s1 +; CHECK-LE-NEXT: vstr.16 s4, [sp, #16] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #16] +; CHECK-LE-NEXT: strh r2, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB16_5 +; CHECK-LE-NEXT: .LBB16_13: @ %cond.store7 +; CHECK-LE-NEXT: vstr.16 s2, [sp, #12] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #12] +; CHECK-LE-NEXT: strh r2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB16_6 +; CHECK-LE-NEXT: .LBB16_14: @ %cond.store9 +; CHECK-LE-NEXT: vmovx.f16 s4, s2 +; CHECK-LE-NEXT: vstr.16 s4, [sp, #8] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #8] +; CHECK-LE-NEXT: strh r2, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB16_7 +; CHECK-LE-NEXT: .LBB16_15: @ %cond.store11 +; CHECK-LE-NEXT: vstr.16 s3, [sp, #4] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #4] +; CHECK-LE-NEXT: strh r2, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB16_8 +; CHECK-LE-NEXT: .LBB16_16: @ %cond.store13 +; CHECK-LE-NEXT: vmovx.f16 s0, s3 +; CHECK-LE-NEXT: vstr.16 s0, [sp] +; CHECK-LE-NEXT: ldrh.w r1, [sp] +; CHECK-LE-NEXT: strh r1, [r0, #14] +; CHECK-LE-NEXT: add sp, #40 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8f16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vrev64.16 q2, q0 -; CHECK-BE-NEXT: vrev16.8 q0, q2 +; CHECK-BE-NEXT: .pad #40 +; CHECK-BE-NEXT: sub sp, #40 ; 
CHECK-BE-NEXT: vrev64.16 q2, q1 -; CHECK-BE-NEXT: vpt.i16 ne, q2, zr -; CHECK-BE-NEXT: vstrbt.8 q0, [r0] +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r2, #0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 +; CHECK-BE-NEXT: bne .LBB16_9 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB16_10 +; CHECK-BE-NEXT: .LBB16_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB16_11 +; CHECK-BE-NEXT: .LBB16_3: @ %else4 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB16_12 +; CHECK-BE-NEXT: .LBB16_4: @ %else6 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB16_13 +; CHECK-BE-NEXT: .LBB16_5: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB16_14 +; CHECK-BE-NEXT: .LBB16_6: @ %else10 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB16_15 +; CHECK-BE-NEXT: .LBB16_7: @ %else12 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB16_16 +; CHECK-BE-NEXT: .LBB16_8: @ %else14 +; CHECK-BE-NEXT: add sp, #40 
+; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB16_9: @ %cond.store +; CHECK-BE-NEXT: vstr.16 s4, [sp, #28] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #28] +; CHECK-BE-NEXT: strh r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB16_2 +; CHECK-BE-NEXT: .LBB16_10: @ %cond.store1 +; CHECK-BE-NEXT: vmovx.f16 s0, s4 +; CHECK-BE-NEXT: vstr.16 s0, [sp, #24] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #24] +; CHECK-BE-NEXT: strh r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB16_3 +; CHECK-BE-NEXT: .LBB16_11: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s5, [sp, #20] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #20] +; CHECK-BE-NEXT: strh r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB16_4 +; CHECK-BE-NEXT: .LBB16_12: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s0, s5 +; CHECK-BE-NEXT: vstr.16 s0, [sp, #16] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #16] +; CHECK-BE-NEXT: strh r2, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB16_5 +; CHECK-BE-NEXT: .LBB16_13: @ %cond.store7 +; CHECK-BE-NEXT: vstr.16 s6, [sp, #12] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #12] +; CHECK-BE-NEXT: strh r2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB16_6 +; CHECK-BE-NEXT: .LBB16_14: @ %cond.store9 +; CHECK-BE-NEXT: vmovx.f16 s0, s6 +; CHECK-BE-NEXT: vstr.16 s0, [sp, #8] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #8] +; CHECK-BE-NEXT: strh r2, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB16_7 +; CHECK-BE-NEXT: .LBB16_15: @ %cond.store11 +; CHECK-BE-NEXT: vstr.16 s7, [sp, #4] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #4] +; CHECK-BE-NEXT: strh r2, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB16_8 +; CHECK-BE-NEXT: .LBB16_16: @ %cond.store13 +; CHECK-BE-NEXT: vmovx.f16 s0, s7 +; CHECK-BE-NEXT: vstr.16 s0, [sp] +; CHECK-BE-NEXT: ldrh.w r1, [sp] +; CHECK-BE-NEXT: strh r1, [r0, #14] +; CHECK-BE-NEXT: add sp, #40 ; CHECK-BE-NEXT: bx lr entry: %c = icmp ugt <8 x i16> %b, zeroinitializer diff --git 
a/llvm/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll @@ -3,9 +3,9 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1-m.main-none-eabi" -; CHECK-LABEL: test -; CHECK: llvm.masked.store.v4i32.p0v4i32 -define void @test(i32* nocapture %A, i32 %n) #0 { +; CHECK-LABEL: test_i32_align4 +; CHECK: call void @llvm.masked.store.v4i32.p0v4i32 +define void @test_i32_align4(i32* nocapture %A, i32 %n) #0 { entry: %cmp12 = icmp sgt i32 %n, 0 br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup @@ -37,4 +37,140 @@ ret void } +; CHECK-LABEL: test_i32_align2 +; CHECK-NOT: call void @llvm.masked.store +define void @test_i32_align2(i32* nocapture %A, i32 %n) #0 { +entry: + %cmp12 = icmp sgt i32 %n, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.013 + %0 = load i32, i32* %arrayidx, align 2 + %.off = add i32 %0, 9 + %1 = icmp ult i32 %.off, 19 + br i1 %1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + store i32 0, i32* %arrayidx, align 2 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw nsw i32 %i.013, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +} + +; CHECK-LABEL: test_i32_noalign +; CHECK: call void @llvm.masked.store.v4i32.p0v4i32 +define void @test_i32_noalign(i32* nocapture %A, i32 %n) #0 { +entry: + %cmp12 = icmp 
sgt i32 %n, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.013 + %0 = load i32, i32* %arrayidx + %.off = add i32 %0, 9 + %1 = icmp ult i32 %.off, 19 + br i1 %1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + store i32 0, i32* %arrayidx + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw nsw i32 %i.013, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +} + +; CHECK-LABEL: test_i16_align2 +; CHECK: call void @llvm.masked.store.v8i16.p0v8i16 +define void @test_i16_align2(i16* nocapture %A, i32 %n) #0 { +entry: + %cmp12 = icmp sgt i32 %n, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.013 + %0 = load i16, i16* %arrayidx, align 2 + %.off = add i16 %0, 9 + %1 = icmp ult i16 %.off, 19 + br i1 %1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + store i16 0, i16* %arrayidx, align 2 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw nsw i32 %i.013, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +} + +; CHECK-LABEL: test_i16_align1 +; CHECK-NOT: call void 
@llvm.masked.store +define void @test_i16_align1(i16* nocapture %A, i32 %n) #0 { +entry: + %cmp12 = icmp sgt i32 %n, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.013 + %0 = load i16, i16* %arrayidx, align 1 + %.off = add i16 %0, 9 + %1 = icmp ult i16 %.off, 19 + br i1 %1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + store i16 0, i16* %arrayidx, align 1 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw nsw i32 %i.013, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +} + attributes #0 = { "target-features"="+mve" }