diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13705,6 +13705,18 @@
                              const ARMSubtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Op = N->getOperand(0);
+  SDLoc dl(N);
+
+  if (Subtarget->hasMVEIntegerOps()) {
+    // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value
+    // will need to come from a GPR.
+    if (Op.getValueType() == MVT::f32)
+      return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+                             DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
+    else if (Op.getValueType() == MVT::f16)
+      return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+                             DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
+  }
 
   if (!Subtarget->hasNEON())
     return SDValue();
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2107,10 +2107,10 @@
   def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
             (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;
 
-  def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))),
-            (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>;
-  def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))),
-            (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>;
+  def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))),
+            (MVE_VDUP16 rGPR:$elem)>;
+  def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))),
+            (MVE_VDUP32 rGPR:$elem)>;
 
   def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
             (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
@@ -2134,15 +2134,15 @@
             (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
                         (v4i32 MQPR:$inactive))>;
   def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred),
-                            (v4f32 (ARMvdup (f32 SPR:$elem))),
+                            (v4f32 (ARMvdup (i32 rGPR:$elem))),
                             (v4f32 MQPR:$inactive))),
-            (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR)),
-                        ARMVCCThen, (v4i1 VCCR:$pred), (v4f32 MQPR:$inactive))>;
+            (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
+                        (v4f32 MQPR:$inactive))>;
   def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred),
-                            (v8f16 (ARMvdup (f16 HPR:$elem))),
+                            (v8f16 (ARMvdup (i32 rGPR:$elem))),
                             (v8f16 MQPR:$inactive))),
-            (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR)),
-                        ARMVCCThen, (v8i1 VCCR:$pred), (v8f16 MQPR:$inactive))>;
+            (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred),
+                        (v8f16 MQPR:$inactive))>;
 }
@@ -4024,12 +4024,12 @@
   def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)),
                 (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>;
 
-  def i8r  : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)),
-                 (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>;
-  def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)),
-                 (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>;
-  def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)),
-                 (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>;
+  def i8r  : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)),
+                 (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+  def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)),
+                 (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+  def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)),
+                 (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc))>;
 
   def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))),
             (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
@@ -4038,12 +4038,12 @@
   def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))),
             (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
 
-  def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))),
-            (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
-  def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))),
-            (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
-  def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))),
-            (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+  def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)))),
+            (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+  def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)))),
+            (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+  def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)))),
+            (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
 }
 
 multiclass unpred_vcmpf_z<int fc> {
@@ -4059,25 +4059,25 @@
 }
 
 multiclass unpred_vcmpf_r<int fc> {
-  def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
-                (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
-  def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
-                (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
+  def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
+            (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
+  def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
+            (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
 
-  def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)),
-                 (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>;
-  def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)),
-                 (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>;
+  def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)),
+            (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+  def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)),
+            (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc))>;
 
   def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))),
             (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
   def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))),
             (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
-  def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))),
-            (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
-  def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))),
-            (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
+  def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)))),
+            (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+  def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)))),
+            (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
 }
 
 let Predicates = [HasMVEInt] in {
@@ -4788,25 +4788,21 @@
                            Instruction instr_f32> {
   let Predicates = [HasMVEFloat] in {
     // Unpredicated F16
-    def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)))),
-              (v8f16 (instr_f16 (v8f16 MQPR:$Qm),
-                          (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR))))>;
+    def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)))),
+              (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val)))>;
     // Unpredicated F32
-    def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)))),
-              (v4f32 (instr_f32 (v4f32 MQPR:$Qm),
-                          (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR))))>;
+    def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)))),
+              (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val)))>;
     // Predicated F16
-    def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)),
+    def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)),
                       (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
-              (v8f16 (instr_f16 (v8f16 MQPR:$Qm),
-                          (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR)),
+              (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val),
                           ARMVCCThen, (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive)))>;
     // Preicated F32
-    def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)),
+    def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)),
                       (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
-              (v4f32 (instr_f32 (v4f32 MQPR:$Qm),
-                          (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR)),
+              (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val),
                           ARMVCCThen, (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
   }
@@ -5029,19 +5025,19 @@
 defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>;
 
 let Predicates = [HasMVEInt] in {
-  def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
-            (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
-  def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
-            (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
-  def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
-            (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
+  def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))),
+            (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), rGPR:$Rm))>;
+  def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))),
+            (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), rGPR:$Rm))>;
+  def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))),
+            (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), rGPR:$Rm))>;
 
-  def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
-            (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
-  def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
-            (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
-  def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
-            (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
+  def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))),
+            (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), rGPR:$Rm))>;
+  def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))),
+            (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), rGPR:$Rm))>;
+  def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))),
+            (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), rGPR:$Rm))>;
 }
 
 class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
@@ -5223,19 +5219,21 @@
   defvar pred_int = int_arm_mve_fma_predicated;
   defvar v1 = (VTI.Vec MQPR:$v1);
   defvar v2 = (VTI.Vec MQPR:$v2);
-  defvar s = !if(VTI.Size{0}, (f16 HPR:$s), (f32 SPR:$s));
-  defvar vs = (VTI.Vec (ARMvdup s));
-  defvar is = (i32 (COPY_TO_REGCLASS s, rGPR));
+  defvar vs = (VTI.Vec (ARMvdup (i32 rGPR:$s)));
+  defvar is = (i32 rGPR:$s);
   defvar pred = (VTI.Pred VCCR:$pred);
 
   let Predicates = [HasMVEFloat] in {
     if scalar_addend then {
-      def : Pat<(VTI.Vec (fma v1, v2, vs)), (VTI.Vec (Inst v1, v2, is))>;
+      def : Pat<(VTI.Vec (fma v1, v2, vs)),
+                (VTI.Vec (Inst v1, v2, is))>;
       def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)),
                 (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>;
     } else {
-      def : Pat<(VTI.Vec (fma v1, vs, v2)), (VTI.Vec (Inst v2, v1, is))>;
-      def : Pat<(VTI.Vec (fma vs, v1, v2)), (VTI.Vec (Inst v2, v1, is))>;
+      def : Pat<(VTI.Vec (fma v1, vs, v2)),
+                (VTI.Vec (Inst v2, v1, is))>;
+      def : Pat<(VTI.Vec (fma vs, v1, v2)),
+                (VTI.Vec (Inst v2, v1, is))>;
       def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)),
                 (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
       def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)),
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -8,7 +8,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.f16 r1, s0
 ; CHECK-NEXT:    vdup.16 q0, r1
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -56,7 +56,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.f16 r1, s0
 ; CHECK-NEXT:    vdup.16 q0, r1
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -104,7 +104,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.f16 r1, s0
 ; CHECK-NEXT:    vdup.16 q0, r1
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -152,7 +152,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.f16 r1, s0
 ; CHECK-NEXT:    vdup.16 q0, r1
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -200,7 +200,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.f16 r1, s0
 ; CHECK-NEXT:    vdup.16 q0, r1
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -248,7 +248,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.f16 r1, s0
 ; CHECK-NEXT:    vdup.16 q0, r1
 ; CHECK-NEXT:  .LBB5_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -298,7 +298,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -353,7 +353,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -408,7 +408,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -462,7 +462,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -517,7 +517,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:    vneg.f16 q0, q0
 ; CHECK-NEXT:  .LBB10_1: @ %vector.body
@@ -573,7 +573,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB11_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -628,7 +628,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB12_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -683,7 +683,7 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB13_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -744,7 +744,7 @@
 ; CHECK-NEXT:    @ Child Loop BB14_2 Depth 2
 ; CHECK-NEXT:    vldr.16 s0, [r1]
 ; CHECK-NEXT:    mov r5, r12
-; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.f16 r4, s0
 ; CHECK-NEXT:    vdup.16 q0, r4
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB14_2: @ %vector.body
@@ -833,18 +833,18 @@
 ; CHECK-NEXT:  @ %bb.1: @ %if.then
 ; CHECK-NEXT:    ldr r6, [r0, #8]
 ; CHECK-NEXT:    vldr.16 s0, [r6]
-; CHECK-NEXT:    vmov lr, s0
+; CHECK-NEXT:    vmov.f16 lr, s0
 ; CHECK-NEXT:    vldr.16 s0, [r6, #2]
 ; CHECK-NEXT:    vdup.16 q3, lr
 ; CHECK-NEXT:    lsr.w lr, r3, #2
-; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov.f16 r5, s0
 ; CHECK-NEXT:    vldr.16 s0, [r6, #4]
 ; CHECK-NEXT:    vdup.16 q2, r5
-; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.f16 r4, s0
 ; CHECK-NEXT:    vldr.16 s0, [r6, #6]
 ; CHECK-NEXT:    vdup.16 q1, r4
 ; CHECK-NEXT:    add.w r4, r12, r7, lsl #1
-; CHECK-NEXT:    vmov r6, s0
+; CHECK-NEXT:    vmov.f16 r6, s0
 ; CHECK-NEXT:    vdup.16 q0, r6
 ; CHECK-NEXT:    wls lr, lr, .LBB15_5
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
@@ -1150,34 +1150,34 @@
 ; CHECK-NEXT:    vldr.16 s14, [r12, #4]
 ; CHECK-NEXT:    vldr.16 s5, [r12, #2]
 ; CHECK-NEXT:    vstrb.8 q0, [r4], #8
-; CHECK-NEXT:    vldrw.u32 q0, [r5]
-; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    adds r6, r5, #2
-; CHECK-NEXT:    add.w r9, r5, #16
-; CHECK-NEXT:    vmul.f16 q0, q0, r0
+; CHECK-NEXT:    vldrw.u32 q0, [r5]
+; CHECK-NEXT:    vmov.f16 r0, s7
 ; CHECK-NEXT:    vldrw.u32 q4, [r6]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r6, r5, #6
+; CHECK-NEXT:    vmul.f16 q0, q0, r0
+; CHECK-NEXT:    vmov.f16 r0, s5
 ; CHECK-NEXT:    vfma.f16 q0, q4, r0
 ; CHECK-NEXT:    vldrw.u32 q4, [r5, #4]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    cmp.w r11, #16
+; CHECK-NEXT:    vmov.f16 r0, s14
+; CHECK-NEXT:    adds r6, r5, #6
 ; CHECK-NEXT:    vfma.f16 q0, q4, r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.f16 r0, s12
 ; CHECK-NEXT:    vldrw.u32 q3, [r6]
 ; CHECK-NEXT:    add.w r6, r5, #10
+; CHECK-NEXT:    add.w r9, r5, #16
+; CHECK-NEXT:    cmp.w r11, #16
 ; CHECK-NEXT:    vfma.f16 q0, q3, r0
 ; CHECK-NEXT:    vldrw.u32 q3, [r5, #8]
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.f16 r0, s10
 ; CHECK-NEXT:    vfma.f16 q0, q3, r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.f16 r0, s8
 ; CHECK-NEXT:    vldrw.u32 q2, [r6]
 ; CHECK-NEXT:    add.w r6, r5, #14
 ; CHECK-NEXT:    vfma.f16 q0, q2, r0
 ; CHECK-NEXT:    vldrw.u32 q2, [r5, #12]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.f16 r0, s6
 ; CHECK-NEXT:    vfma.f16 q0, q2, r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r6]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    blo .LBB16_8
@@ -1190,40 +1190,40 @@
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldr.16 s4, [r6]
 ; CHECK-NEXT:    add.w r5, r9, #2
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r9]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldr.16 s4, [r6, #2]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
 ; CHECK-NEXT:    add.w r5, r9, #6
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldr.16 s4, [r6, #4]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r9, #4]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldr.16 s4, [r6, #6]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
 ; CHECK-NEXT:    add.w r5, r9, #10
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldr.16 s4, [r6, #8]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r9, #8]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldr.16 s4, [r6, #10]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
 ; CHECK-NEXT:    add.w r5, r9, #14
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldr.16 s4, [r6, #12]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r9, #12]
 ; CHECK-NEXT:    add.w r9, r9, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldr.16 s4, [r6, #14]
 ; CHECK-NEXT:    adds r6, #16
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    le lr, .LBB16_6
@@ -1247,7 +1247,7 @@
 ; CHECK-NEXT:    subs r0, #1
 ; CHECK-NEXT:    adds r6, #2
 ; CHECK-NEXT:    cmp r0, #1
-; CHECK-NEXT:    vmov r7, s4
+; CHECK-NEXT:    vmov.f16 r7, s4
 ; CHECK-NEXT:    vldrh.u16 q1, [r5], #2
 ; CHECK-NEXT:    vfma.f16 q0, q1, r7
 ; CHECK-NEXT:    bgt .LBB16_10
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -706,9 +706,8 @@
 ; CHECK-NEXT:  .LBB14_1: @ %for.body.us
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB14_2 Depth 2
-; CHECK-NEXT:    vldr s0, [r1]
+; CHECK-NEXT:    ldr r4, [r1]
 ; CHECK-NEXT:    mov r5, r12
-; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vdup.32 q0, r4
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB14_2: @ %vector.body
@@ -788,55 +787,49 @@
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    ldrh.w r10, [r0]
+; CHECK-NEXT:    ldrh.w r9, [r0]
 ; CHECK-NEXT:    ldr.w r12, [r0, #4]
-; CHECK-NEXT:    sub.w r7, r10, #1
+; CHECK-NEXT:    sub.w r7, r9, #1
 ; CHECK-NEXT:    cmp r7, #3
 ; CHECK-NEXT:    bhi .LBB15_6
 ; CHECK-NEXT:  @ %bb.1: @ %if.then
 ; CHECK-NEXT:    ldr r6, [r0, #8]
 ; CHECK-NEXT:    add.w r4, r12, r7, lsl #2
+; CHECK-NEXT:    ldrd lr, r8, [r6]
+; CHECK-NEXT:    ldrd r5, r6, [r6, #8]
+; CHECK-NEXT:    vdup.32 q3, lr
+; CHECK-NEXT:    vdup.32 q2, r8
+; CHECK-NEXT:    vdup.32 q0, r6
+; CHECK-NEXT:    vdup.32 q1, r5
 ; CHECK-NEXT:    lsr.w lr, r3, #2
-; CHECK-NEXT:    vldr s0, [r6, #12]
-; CHECK-NEXT:    vldr s4, [r6, #8]
-; CHECK-NEXT:    vmov r7, s0
-; CHECK-NEXT:    vldr s8, [r6, #4]
-; CHECK-NEXT:    vdup.32 q0, r7
-; CHECK-NEXT:    vmov r7, s4
-; CHECK-NEXT:    vldr s12, [r6]
-; CHECK-NEXT:    vdup.32 q1, r7
-; CHECK-NEXT:    vmov r7, s8
-; CHECK-NEXT:    vdup.32 q2, r7
-; CHECK-NEXT:    vmov r7, s12
-; CHECK-NEXT:    vdup.32 q3, r7
 ; CHECK-NEXT:    wls lr, lr, .LBB15_5
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
-; CHECK-NEXT:    bic r9, r3, #3
+; CHECK-NEXT:    bic r10, r3, #3
 ; CHECK-NEXT:    movs r6, #0
-; CHECK-NEXT:    add.w r8, r2, r9, lsl #2
+; CHECK-NEXT:    add.w r8, r2, r10, lsl #2
 ; CHECK-NEXT:  .LBB15_3: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r5, r1, r6
-; CHECK-NEXT:    adds r7, r2, r6
-; CHECK-NEXT:    vldrw.u32 q4, [r5]
-; CHECK-NEXT:    adds r5, r4, r6
-; CHECK-NEXT:    vstrw.32 q4, [r5]
-; CHECK-NEXT:    add.w r5, r12, r6
-; CHECK-NEXT:    vldrw.u32 q4, [r5]
-; CHECK-NEXT:    vldrw.u32 q5, [r5, #4]
-; CHECK-NEXT:    vldrw.u32 q6, [r5, #12]
+; CHECK-NEXT:    adds r7, r1, r6
+; CHECK-NEXT:    adds r5, r2, r6
+; CHECK-NEXT:    vldrw.u32 q4, [r7]
+; CHECK-NEXT:    adds r7, r4, r6
+; CHECK-NEXT:    vstrw.32 q4, [r7]
+; CHECK-NEXT:    add.w r7, r12, r6
+; CHECK-NEXT:    vldrw.u32 q4, [r7]
+; CHECK-NEXT:    vldrw.u32 q5, [r7, #4]
+; CHECK-NEXT:    vldrw.u32 q6, [r7, #12]
 ; CHECK-NEXT:    adds r6, #16
 ; CHECK-NEXT:    vmul.f32 q4, q4, q3
 ; CHECK-NEXT:    vfma.f32 q4, q5, q2
-; CHECK-NEXT:    vldrw.u32 q5, [r5, #8]
+; CHECK-NEXT:    vldrw.u32 q5, [r7, #8]
 ; CHECK-NEXT:    vfma.f32 q4, q5, q1
 ; CHECK-NEXT:    vfma.f32 q4, q6, q0
-; CHECK-NEXT:    vstrw.32 q4, [r7]
+; CHECK-NEXT:    vstrw.32 q4, [r5]
 ; CHECK-NEXT:    le lr, .LBB15_3
 ; CHECK-NEXT:  @ %bb.4: @ %while.end.loopexit
 ; CHECK-NEXT:    add r4, r6
-; CHECK-NEXT:    add.w r12, r12, r9, lsl #2
-; CHECK-NEXT:    add.w r1, r1, r9, lsl #2
+; CHECK-NEXT:    add.w r12, r12, r10, lsl #2
+; CHECK-NEXT:    add.w r1, r1, r10, lsl #2
 ; CHECK-NEXT:    mov r2, r8
 ; CHECK-NEXT:  .LBB15_5: @ %while.end
 ; CHECK-NEXT:    and r7, r3, #3
@@ -857,10 +850,10 @@
 ; CHECK-NEXT:    ldr.w r12, [r0, #4]
 ; CHECK-NEXT:  .LBB15_6: @ %if.end
 ; CHECK-NEXT:    add.w r0, r12, r3, lsl #2
-; CHECK-NEXT:    lsr.w lr, r10, #2
+; CHECK-NEXT:    lsr.w lr, r9, #2
 ; CHECK-NEXT:    wls lr, lr, .LBB15_10
 ; CHECK-NEXT:  @ %bb.7: @ %while.body51.preheader
-; CHECK-NEXT:    bic r2, r10, #3
+; CHECK-NEXT:    bic r2, r9, #3
 ; CHECK-NEXT:    adds r1, r2, r3
 ; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    add.w r1, r12, r1, lsl #2
@@ -873,7 +866,7 @@
 ; CHECK-NEXT:    add.w r12, r12, r2, lsl #2
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:  .LBB15_10: @ %while.end55
-; CHECK-NEXT:    ands r1, r10, #3
+; CHECK-NEXT:    ands r1, r9, #3
 ; CHECK-NEXT:    beq .LBB15_12
 ; CHECK-NEXT:  @ %bb.11: @ %if.then59
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
@@ -1056,10 +1049,10 @@
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #88
-; CHECK-NEXT:    sub sp, #88
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .pad #40
+; CHECK-NEXT:    sub sp, #40
 ; CHECK-NEXT:    cmp r3, #8
 ; CHECK-NEXT:    blo.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.1: @ %if.then
@@ -1068,161 +1061,130 @@
 ; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
 ; CHECK-NEXT:    ldrh r4, [r0]
-; CHECK-NEXT:    lsr.w r8, r3, #2
-; CHECK-NEXT:    ldrd r5, r12, [r0, #4]
-; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    movs r5, #1
+; CHECK-NEXT:    ldrd r6, r12, [r0, #4]
+; CHECK-NEXT:    lsrs r3, r3, #2
 ; CHECK-NEXT:    sub.w r0, r4, #8
-; CHECK-NEXT:    and r10, r0, #7
 ; CHECK-NEXT:    add.w r7, r0, r0, lsr #29
-; CHECK-NEXT:    add.w r0, r10, #1
-; CHECK-NEXT:    asrs r6, r7, #3
-; CHECK-NEXT:    cmp r6, #1
+; CHECK-NEXT:    and r0, r0, #7
+; CHECK-NEXT:    asr.w lr, r7, #3
+; CHECK-NEXT:    cmp.w lr, #1
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    asrgt r3, r7, #3
-; CHECK-NEXT:    add.w r7, r5, r4, lsl #2
+; CHECK-NEXT:    asrgt r5, r7, #3
+; CHECK-NEXT:    add.w r7, r6, r4, lsl #2
 ; CHECK-NEXT:    sub.w r11, r7, #4
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    rsbs r3, r4, #0
-; CHECK-NEXT:    str r3, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    add.w r3, r12, #32
-; CHECK-NEXT:    str r4, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    rsbs r7, r4, #0
+; CHECK-NEXT:    str r7, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    add.w r7, r12, #32
+; CHECK-NEXT:    str r0, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT:    adds r0, #1
+; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r4, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r7, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_3: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r8, r8, #1
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r11, r3, [sp, #28] @ 8-byte Folded Reload
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    add.w r0, r9, r0, lsl #2
-; CHECK-NEXT:    add.w r5, r0, #16
-; CHECK-NEXT:    beq.w .LBB16_12
+; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    subs r3, #1
+; CHECK-NEXT:    add.w r0, r8, r0, lsl #2
+; CHECK-NEXT:    add.w r6, r0, #16
+; CHECK-NEXT:    beq .LBB16_12
 ; CHECK-NEXT:  .LBB16_4: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
-; CHECK-NEXT:    vldr s2, [r12, #12]
-; CHECK-NEXT:    vldrw.u32 q3, [r1], #16
-; CHECK-NEXT:    vldr s8, [r12, #28]
-; CHECK-NEXT:    add.w r9, r5, #32
-; CHECK-NEXT:    vldr s0, [r12]
-; CHECK-NEXT:    vstr s2, [sp, #64] @ 4-byte Spill
-; CHECK-NEXT:    vmov r6, s8
-; CHECK-NEXT:    vldr s2, [r12, #16]
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vldr s4, [r12, #20]
-; CHECK-NEXT:    vldr s6, [r12, #24]
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vldr s5, [r12, #4]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vldr s7, [r12, #8]
-; CHECK-NEXT:    vstrb.8 q3, [r11], #16
-; CHECK-NEXT:    vldrw.u32 q2, [r5, #28]
-; CHECK-NEXT:    vldrw.u32 q4, [r5]
-; CHECK-NEXT:    vldrw.u32 q5, [r5, #4]
-; CHECK-NEXT:    vldrw.u32 q3, [r5, #20]
-; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [r5, #24]
-; CHECK-NEXT:    vldrw.u32 q6, [r5, #12]
-; CHECK-NEXT:    vldrw.u32 q7, [r5, #16]
-; CHECK-NEXT:    vmul.f32 q0, q4, r3
-; CHECK-NEXT:    vldrw.u32 q4, [r5, #8]
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vfma.f32 q0, q5, r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    vfma.f32 q0, q4, r3
-; CHECK-NEXT:    vldr s4, [sp, #64] @ 4-byte Reload
-; CHECK-NEXT:    vmov r7, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vfma.f32 q0, q6, r3
-; CHECK-NEXT:    vfma.f32 q0, q7, r4
-; CHECK-NEXT:    vfma.f32 q0, q3, r0
-; CHECK-NEXT:    vfma.f32 q0, q2, r7
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vfma.f32 q0, q1, r6
-; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    add.w lr, r12, #12
+; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
+; CHECK-NEXT:    ldm.w r12, {r0, r5, r7}
+; CHECK-NEXT:    ldm.w lr, {r4, r9, lr}
+; CHECK-NEXT:    ldrd r8, r10, [r12, #24]
+; CHECK-NEXT:    vstrb.8 q0, [r11], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r6]
+; CHECK-NEXT:    vldrw.u32 q1, [r6, #4]
+; CHECK-NEXT:    vldrw.u32 q6, [r6, #8]
+; CHECK-NEXT:    vldrw.u32 q4, [r6, #12]
+; CHECK-NEXT:    vmul.f32 q0, q0, r0
+; CHECK-NEXT:    vldrw.u32 q5, [r6, #16]
+; CHECK-NEXT:    vfma.f32 q0, q1, r5
+; CHECK-NEXT:    vldrw.u32 q2, [r6, #20]
+; CHECK-NEXT:    vfma.f32 q0, q6, r7
+; CHECK-NEXT:    vldrw.u32 q3, [r6, #24]
+; CHECK-NEXT:    vfma.f32 q0, q4, r4
+; CHECK-NEXT:    vldrw.u32 q1, [r6, #28]
+; CHECK-NEXT:    vfma.f32 q0, q5, r9
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    vfma.f32 q0, q2, lr
+; CHECK-NEXT:    add.w r5, r6, #32
+; CHECK-NEXT:    vfma.f32 q0, q3, r8
 ; CHECK-NEXT:    cmp r0, #16
-; CHECK-NEXT:    blo .LBB16_8
+; CHECK-NEXT:    vfma.f32 q0, q1, r10
+; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT:    strd r11, r3, [sp, #28] @ 8-byte Folded Spill
+; CHECK-NEXT:    blo .LBB16_7
 ; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr.w lr, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr.w lr, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    ldr r6, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_6: @ %for.body
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrw.u32 q1, [r9, #28]
-; CHECK-NEXT:    vldr s24, [r6]
-; CHECK-NEXT:    vldr s26, [r6, #4]
-; CHECK-NEXT:    vldrw.u32 q3, [r9, #4]
-; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r9, #20]
-; CHECK-NEXT:    vldr s28, [r6, #8]
-; CHECK-NEXT:    vmov r7, s24
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r9, #24]
-; CHECK-NEXT:    vldr s25, [r6, #16]
-; CHECK-NEXT:    vldrw.u32 q5, [r9, #12]
-; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r9]
-; CHECK-NEXT:    vldr s27, [r6, #20]
-; CHECK-NEXT:    vldrw.u32 q4, [r9, #16]
-; CHECK-NEXT:    vldr s29, [r6, #24]
-; CHECK-NEXT:    vldrw.u32 q2, [r9, #8]
-; CHECK-NEXT:    vldr s31, [r6, #28]
-; CHECK-NEXT:    vmov r5, s25
-; CHECK-NEXT:    vldr s30, [r6, #12]
-; CHECK-NEXT:    vfma.f32 q0, q1, r7
-; CHECK-NEXT:    vmov r7, s26
-; CHECK-NEXT:    add.w r9, r9, #32
-; CHECK-NEXT:    vfma.f32 q0, q3, r7
-; CHECK-NEXT:    vmov r7, s28
-; CHECK-NEXT:    vfma.f32 q0, q2, r7
-; CHECK-NEXT:    vmov r7, s30
-; CHECK-NEXT:    vfma.f32 q0, q5, r7
-; CHECK-NEXT:    vmov r3, s27
-; CHECK-NEXT:    vfma.f32 q0, q4, r5
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov r4, s29
-; CHECK-NEXT:    adds r6, #32
-; CHECK-NEXT:    vfma.f32 q0, q1, r3
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov r0, s31
-; CHECK-NEXT:    vfma.f32 q0, q1, r4
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    ldm.w r6, {r0, r3, r4, r7, r10, r11}
+; CHECK-NEXT:    vldrw.u32 q1, [r5]
+; CHECK-NEXT:    vldrw.u32 q6, [r5, #8]
+; CHECK-NEXT:    vldrw.u32 q4, [r5, #12]
+; CHECK-NEXT:    vldrw.u32 q5, [r5, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r5, #20]
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
+; CHECK-NEXT:    ldrd r1, r9, [r6, #24]
+; CHECK-NEXT:    vldrw.u32 q3, [r5, #24]
+; CHECK-NEXT:    vfma.f32 q0, q1, r3
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #28]
+; CHECK-NEXT:    vfma.f32 q0, q6, r4
+; CHECK-NEXT:    add.w r8, r5, #32
+; CHECK-NEXT:    vfma.f32 q0, q4, r7
+; CHECK-NEXT:    adds r6, #32
+; CHECK-NEXT:    vfma.f32 q0, q5, r10
+; CHECK-NEXT:    vfma.f32 q0, q2, r11
+; CHECK-NEXT:    mov r5, r8
+; CHECK-NEXT:    vfma.f32 q0, q3, r1
+; CHECK-NEXT:    vfma.f32 q0, q1, r9
 ; CHECK-NEXT:    le lr, .LBB16_6
-; CHECK-NEXT:  @ %bb.7: @ %for.end
+; CHECK-NEXT:    b .LBB16_8
+; CHECK-NEXT:  .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    mov r8, r5
+; CHECK-NEXT:  .LBB16_8: @ %for.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    cmp.w r10, #0
-; CHECK-NEXT:    bne .LBB16_9
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_8: @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r6, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    cmp.w r10, #0
-; CHECK-NEXT:    beq.w .LBB16_3
-; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
+; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    beq .LBB16_3
+; CHECK-NEXT:  @ %bb.9: @ %while.body76.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r5, r9
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldr s4, [r6]
-; CHECK-NEXT:    vldrw.u32 q2, [r5], #4
-; CHECK-NEXT:    subs r0, #1
-; CHECK-NEXT:    adds r6, #4
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    cmp r0, #1
-; CHECK-NEXT:    vfma.f32 q0, q2, r3
+; CHECK-NEXT:    ldr r1, [r6], #4
+; CHECK-NEXT:    vldrw.u32 q1, [r0], #4
+; CHECK-NEXT:    subs r5, #1
+; CHECK-NEXT:    vfma.f32 q0, q1, r1
+; CHECK-NEXT:    cmp r5, #1
 ; CHECK-NEXT:    bgt .LBB16_10
 ; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    add.w r9, r9, r10, lsl #2
+; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    add.w r8, r8, r0, lsl #2
 ; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
-; CHECK-NEXT:    add sp, #88
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    add sp, #40
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-fmas.ll b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
--- a/llvm/test/CodeGen/Thumb2/mve-fmas.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
@@ -199,7 +199,7 @@
 ; CHECK-MVE-FP-LABEL: vfmar16:
 ; CHECK-MVE-FP:       @ %bb.0: @ %entry
 ; CHECK-MVE-FP-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-MVE-FP-NEXT:    vmov r0, s8
+; CHECK-MVE-FP-NEXT:    vmov.f16 r0, s8
 ; CHECK-MVE-FP-NEXT:    vmul.f16 q1, q1, r0
 ; CHECK-MVE-FP-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-MVE-FP-NEXT:    bx lr
@@ -207,7 +207,7 @@
 ; CHECK-MVE-VMLA-LABEL: vfmar16:
 ; CHECK-MVE-VMLA:       @ %bb.0: @ %entry
 ; CHECK-MVE-VMLA-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-MVE-VMLA-NEXT:    vmov r0, s8
+; CHECK-MVE-VMLA-NEXT:    vmov.f16 r0, s8
 ; CHECK-MVE-VMLA-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-MVE-VMLA-NEXT:    bx lr
 ;
@@ -266,14 +266,14 @@
 ; CHECK-MVE-FP:       @ %bb.0: @ %entry
 ; CHECK-MVE-FP-NEXT:    vcvtb.f16.f32 s8, s8
 ; CHECK-MVE-FP-NEXT:    vmul.f16 q0, q0, q1
-; CHECK-MVE-FP-NEXT:    vmov r0, s8
+; CHECK-MVE-FP-NEXT:    vmov.f16 r0, s8
 ; CHECK-MVE-FP-NEXT:    vadd.f16 q0, q0, r0
 ; CHECK-MVE-FP-NEXT:    bx lr
 ;
 ; CHECK-MVE-VMLA-LABEL: vfma16:
 ; CHECK-MVE-VMLA:       @ %bb.0: @ %entry
 ; CHECK-MVE-VMLA-NEXT:    vcvtb.f16.f32 s8, s8
-; CHECK-MVE-VMLA-NEXT:    vmov r0, s8
+; CHECK-MVE-VMLA-NEXT:    vmov.f16 r0, s8
 ; CHECK-MVE-VMLA-NEXT:    vfmas.f16 q0, q1, r0
 ; CHECK-MVE-VMLA-NEXT:    bx lr
 ;
@@ -437,8 +437,8 @@
 define arm_aapcs_vfpcc <4 x float> @vfmas32(<4 x float> %src1, <4 x float> %src2, float %src3) {
 ; CHECK-MVE-FP-LABEL: vfmas32:
 ; CHECK-MVE-FP:       @ %bb.0: @ %entry
-; CHECK-MVE-FP-NEXT:    vmul.f32 q0, q0, q1
 ; CHECK-MVE-FP-NEXT:    vmov r0, s8
+; CHECK-MVE-FP-NEXT:    vmul.f32 q0, q0, q1
 ; CHECK-MVE-FP-NEXT:    vadd.f32 q0, q0, r0
 ; CHECK-MVE-FP-NEXT:    bx lr
 ;
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll
@@ -4,7 +4,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vdupq_n_f16(float %a.coerce) {
 ; CHECK-LABEL: test_vdupq_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.f16 r0, s0
 ; CHECK-NEXT:    vdup.16 q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -97,7 +97,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vdupq_m_n_f16(<8 x half> %inactive, float %a.coerce, i16 zeroext %p) {
 ; CHECK-LABEL: test_vdupq_m_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov.f16 r1, s4
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vdupt.16 q0, r1
@@ -117,10 +117,10 @@
 define arm_aapcs_vfpcc <4 x float> @test_vdupq_m_n_f32(<4 x float> %inactive, float %a, i16 zeroext %p) {
 ; CHECK-LABEL: test_vdupq_m_n_f32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vmsr p0, r0
-; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.32 q0, r0
+; CHECK-NEXT:    vdupt.32 q0, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll
@@ -24,7 +24,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) {
 ; CHECK-LABEL: test_vfmaq_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.f16 r0, s8
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -53,7 +53,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vfmasq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) {
 ; CHECK-LABEL: test_vfmasq_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.f16 r0, s8
 ; CHECK-NEXT:    vfmas.f16 q0, q1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -390,7 +390,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) {
 ; CHECK-LABEL: test_vfmaq_m_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.f16 r1, s8
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f16 q0, q1, r1
@@ -410,10 +410,10 @@
 define arm_aapcs_vfpcc <4 x float> @test_vfmaq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) {
 ; CHECK-LABEL: test_vfmaq_m_n_f32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vmsr p0, r0
-; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vfmat.f32 q0, q1, r0
+; CHECK-NEXT:    vfmat.f32 q0, q1, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
@@ -427,7 +427,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vfmasq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) {
 ; CHECK-LABEL: test_vfmasq_m_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.f16 r1, s8
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmast.f16 q0, q1, r1
@@ -447,10 +447,10 @@
 define arm_aapcs_vfpcc <4 x float> @test_vfmasq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) {
 ; CHECK-LABEL: test_vfmasq_m_n_f32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vmsr p0, r0
-; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vfmast.f32 q0, q1, r0
+; CHECK-NEXT:    vfmast.f32 q0, q1, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
@@ -106,7 +106,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vaddq_n_f16(<8 x half> %a, float %b.coerce) {
 ; CHECK-LABEL: test_vaddq_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vadd.f16 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -138,10 +138,10 @@
 define arm_aapcs_vfpcc <4 x float> @test_vaddq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) {
 ; CHECK-LABEL: test_vaddq_m_n_f32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vmsr p0, r0
-; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddt.f32 q0, q1, r0
+; CHECK-NEXT:    vaddt.f32 q0, q1, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
@@ -171,7 +171,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vaddq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) {
 ; CHECK-LABEL: test_vaddq_x_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov.f16 r1, s4
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vaddt.f16 q0, q0, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
@@ -269,7 +269,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) {
 ; CHECK-LABEL: test_vmulq_m_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.f16 r1, s8
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vmult.f16 q0, q1, r1
@@ -337,10 +337,10 @@
 define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) {
 ; CHECK-LABEL: test_vmulq_x_n_f32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vmsr p0, r0
-; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.f32 q0, q0, r0
+; CHECK-NEXT:    vmult.f32 q0, q0, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
@@ -106,7 +106,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vsubq_n_f16(<8 x half> %a, float %b.coerce) {
 ; CHECK-LABEL: test_vsubq_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f16 r0, s4
 ; CHECK-NEXT:    vsub.f16 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -138,10 +138,10 @@
 define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) {
 ; CHECK-LABEL: test_vsubq_m_n_f32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vmsr p0, r0
-; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vsubt.f32 q0, q1, r0
+; CHECK-NEXT:    vsubt.f32 q0, q1, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
@@ -171,7 +171,7 @@
 define arm_aapcs_vfpcc <8 x half> @test_vsubq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) {
 ; CHECK-LABEL: test_vsubq_x_n_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov.f16 r1, s4
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vsubt.f16 q0, q0, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
--- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
@@ -183,7 +183,6 @@
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
-; CHECK-NEXT:    vneg.f32 s4, s0
 ; CHECK-NEXT:    mvn r2, #3
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    movs r2, #1
@@ -192,7 +191,7 @@
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vdup.32 q0, r1
 ; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    eor r1, r1, #-2147483648
 ; CHECK-NEXT:    vdup.32 q1, r1
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -250,10 +249,10 @@
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    add.w lr, r2, r1, lsr #3
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.f16 r1, s0
 ; CHECK-NEXT:    vneg.f16 s0, s0
 ; CHECK-NEXT:    vdup.16 q1, r1
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
@@ -486,7 +485,6 @@
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
-; CHECK-NEXT:    vneg.f32 s4, s0
 ; CHECK-NEXT:    mvn r2, #3
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    movs r2, #1
@@ -495,7 +493,7 @@
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vdup.32 q0, r1
 ; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    eor r1, r1, #-2147483648
 ; CHECK-NEXT:    vdup.32 q1, r1
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -553,10 +551,10 @@
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    add.w lr, r2, r1, lsr #3
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.f16 r1, s0
 ; CHECK-NEXT:    vneg.f16 s0, s0
 ; CHECK-NEXT:    vdup.16 q1, r1
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
@@ -917,7 +917,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 eq, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -1059,7 +1059,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_one_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vpt.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vcmpt.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
@@ -1187,7 +1187,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 gt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -1313,7 +1313,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -1439,7 +1439,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 lt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -1565,7 +1565,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -1707,7 +1707,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vpt.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vcmpt.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -1834,7 +1834,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_une_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 ne, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -1960,7 +1960,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -2087,7 +2087,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 lt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -2214,7 +2214,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -2341,7 +2341,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 gt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -2468,7 +2468,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vpt.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vcmpt.f16 lt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
@@ -2596,7 +2596,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vpt.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vcmpt.f16 lt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -3528,7 +3528,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 eq, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -3670,7 +3670,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vpt.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vcmpt.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
@@ -3798,7 +3798,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 lt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -3924,7 +3924,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -4050,7 +4050,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 gt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -4176,7 +4176,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -4318,7 +4318,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vpt.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vcmpt.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -4445,7 +4445,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 ne, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
 ; CHECK-MVEFP-NEXT:    bx lr
@@ -4571,7 +4571,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 ge, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -4698,7 +4698,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 gt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -4825,7 +4825,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -4952,7 +4952,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vcmp.f16 lt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
@@ -5079,7 +5079,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vpt.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vcmpt.f16 gt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpnot
@@ -5207,7 +5207,7 @@
 ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vldr.16 s12, [r0]
-; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.f16 r0, s12
 ; CHECK-MVEFP-NEXT:    vpt.f16 le, q0, r0
 ; CHECK-MVEFP-NEXT:    vcmpt.f16 gt, q0, r0
 ; CHECK-MVEFP-NEXT:    vpsel q0, q1, q2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -82,7 +82,7 @@
 ; CHECK-NEXT:    vldr.16 s0, [r1]
 ; CHECK-NEXT:    vldr.16 s2, [r0]
 ; CHECK-NEXT:    vadd.f16 s0, s2, s0
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.f16 r0, s0
 ; CHECK-NEXT:    vdup.16 q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -21,7 +21,7 @@
 ; CHECK-NEXT:    subs r2, #8
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r2, lsr #3
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.f16 r2, s0
 ; CHECK-NEXT:    vdup.16 q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    dls lr, lr
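
For reference, a minimal sketch (not part of the patch) of the kind of IR this change affects: a scalar-to-vector splat of an f16 value. The function name and target flags here are illustrative assumptions; the expected selection is taken from the updated CHECK lines above (e.g. test_vdupq_n_f16 in mve-intrinsics/dup.ll).

; Hypothetical reproducer; assumes something like
;   llc -mtriple=thumbv8.1m.main -mattr=+mve.fp
define arm_aapcs_vfpcc <8 x half> @splat_f16(half %s) {
entry:
  ; Insert the scalar into lane 0, then broadcast it to every lane.
  %ins = insertelement <8 x half> undef, half %s, i32 0
  %splat = shufflevector <8 x half> %ins, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %splat
}

With the VDUP combine above, the splat should lower to a single GPR transfer plus a vector dup (vmov.f16 r0, s0 followed by vdup.16 q0, r0): since an MVE VDUP always takes its scalar operand from a general-purpose register, moving f16/f32 splat values through VMOVrh/BITCAST to i32 up front lets the rGPR patterns fire directly instead of going through COPY_TO_REGCLASS at instruction selection.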