Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.h
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h
@@ -794,6 +794,8 @@
 
     bool shouldConsiderGEPOffsetSplit() const override { return true; }
 
+    bool isUnsupportedFloatingType(EVT VT) const;
+
     SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
                     SDValue ARMcc, SDValue CCR, SDValue Cmp,
                     SelectionDAG &DAG) const;
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -224,6 +224,13 @@
 void ARMTargetLowering::setAllExpand(MVT VT) {
   for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
     setOperationAction(Opc, VT, Expand);
+
+  // We support these really simple operations even on types where all
+  // the actual arithmetic has to be broken down into simpler
+  // operations or turned into library calls.
+  setOperationAction(ISD::BITCAST, VT, Legal);
+  setOperationAction(ISD::LOAD, VT, Legal);
+  setOperationAction(ISD::STORE, VT, Legal);
 }
 
 void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
@@ -262,9 +269,6 @@
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
-    setOperationAction(ISD::BITCAST, VT, Legal);
-    setOperationAction(ISD::LOAD, VT, Legal);
-    setOperationAction(ISD::STORE, VT, Legal);
 
     if (HasMVEFP) {
       // No native support for these.
@@ -289,9 +293,6 @@
   for (auto VT : LongTypes) {
     addRegisterClass(VT, &ARM::QPRRegClass);
     setAllExpand(VT);
-    setOperationAction(ISD::BITCAST, VT, Legal);
-    setOperationAction(ISD::LOAD, VT, Legal);
-    setOperationAction(ISD::STORE, VT, Legal);
   }
 
   // It is legal to extload from v4i8 to v4i16 or v4i32.
@@ -594,10 +595,14 @@
   else
     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
 
-  if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
-      !Subtarget->isThumb1Only()) {
+  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
+      Subtarget->hasFPRegs()) {
     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+    if (!Subtarget->hasVFP2Base())
+      setAllExpand(MVT::f32);
+    if (!Subtarget->hasFP64())
+      setAllExpand(MVT::f64);
   }
 
   if (Subtarget->hasFullFP16()) {
@@ -4544,6 +4549,16 @@
   return false;
 }
 
+bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
+  if (VT == MVT::f32)
+    return !Subtarget->hasVFP2Base();
+  if (VT == MVT::f64)
+    return !Subtarget->hasFP64();
+  if (VT == MVT::f16)
+    return !Subtarget->hasFullFP16();
+  return false;
+}
+
 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDLoc dl(Op);
@@ -4587,9 +4602,9 @@
   SDValue TrueVal = Op.getOperand(2);
   SDValue FalseVal = Op.getOperand(3);
 
-  if (!Subtarget->hasFP64() && LHS.getValueType() == MVT::f64) {
-    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
-                                                    dl);
+  if (isUnsupportedFloatingType(LHS.getValueType())) {
+    DAG.getTargetLoweringInfo().softenSetCCOperands(
+        DAG, LHS.getValueType(), LHS, RHS, CC, dl);
 
     // If softenSetCCOperands only returned one value, we should compare it to
     // zero.
@@ -4828,9 +4843,9 @@
   SDValue Dest = Op.getOperand(4);
   SDLoc dl(Op);
 
-  if (!Subtarget->hasFP64() && LHS.getValueType() == MVT::f64) {
-    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
-                                                    dl);
+  if (isUnsupportedFloatingType(LHS.getValueType())) {
+    DAG.getTargetLoweringInfo().softenSetCCOperands(
+        DAG, LHS.getValueType(), LHS, RHS, CC, dl);
 
     // If softenSetCCOperands only returned one value, we should compare it to
     // zero.
@@ -4975,7 +4990,7 @@
   EVT VT = Op.getValueType();
   if (VT.isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
-  if (!Subtarget->hasFP64() && Op.getOperand(0).getValueType() == MVT::f64) {
+  if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) {
     RTLIB::Libcall LC;
     if (Op.getOpcode() == ISD::FP_TO_SINT)
       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
@@ -5039,7 +5054,7 @@
   EVT VT = Op.getValueType();
   if (VT.isVector())
     return LowerVectorINT_TO_FP(Op, DAG);
-  if (!Subtarget->hasFP64() && Op.getValueType() == MVT::f64) {
+  if (isUnsupportedFloatingType(VT)) {
     RTLIB::Libcall LC;
     if (Op.getOpcode() == ISD::SINT_TO_FP)
       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
Index: llvm/trunk/lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrVFP.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrVFP.td
@@ -2269,13 +2269,13 @@
                     IIC_fpUNA64,
                     [(set (f64 DPR:$Dd),
                           (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
-               RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+               RegConstraint<"$Dn = $Dd">, Requires<[HasFPRegs64]>;
 
 def VMOVScc  : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
                     IIC_fpUNA32,
                     [(set (f32 SPR:$Sd),
                           (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
-               RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
+               RegConstraint<"$Sn = $Sd">, Requires<[HasFPRegs]>;
 } // hasSideEffects
 
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll
+++ llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll
@@ -1,6 +1,8 @@
 ; SOFT:
 ; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
 ; RUN: llc < %s -mtriple=thumb-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s -mtriple=thumbv8.1m.main-none-eabi -float-abi=soft -mattr=+mve | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
 
 ; SOFTFP:
 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-VFP3
@@ -206,8 +208,8 @@
 
 ; CHECK-LABEL: VCMPBRCC:
 
-; CHECK-SOFT: bl __aeabi_fcmpgt
-; CHECK-SOFT: cmp r0, #0
+; CHECK-SOFT: bl __aeabi_fcmp{{gt|le}}
+; CHECK-SOFT: cmp r0, #{{0|1}}
 
 ; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], [[S2]]
 ; CHECK-SOFTFP-FP16: vcmpe.f32 [[S2]], s0
Index: llvm/trunk/test/CodeGen/Thumb2/float-ops.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/float-ops.ll
+++ llvm/trunk/test/CodeGen/Thumb2/float-ops.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -mtriple=thumbv7-none-eabi -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabi -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=NONE -check-prefix=NOREGS
 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VFP4-ALL
 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=FP-ARMv8
 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP4-ALL -check-prefix=VFP4-DP
+; RUN: llc < %s -mtriple=thumbv8.1m.main-none-eabihf -mattr=+mve | FileCheck %s -check-prefix=CHECK -check-prefix=NONE -check-prefix=ONLYREGS
 
 define float @add_f(float %a, float %b) {
 entry:
 ; CHECK-LABEL: add_f:
-; NONE: bl __aeabi_fadd
+; NONE: {{b|bl}} __aeabi_fadd
 ; HARD: vadd.f32 s0, s0, s1
   %0 = fadd float %a, %b
   ret float %0
@@ -15,8 +16,8 @@
 define double @add_d(double %a, double %b) {
 entry:
 ; CHECK-LABEL: add_d:
-; NONE: bl __aeabi_dadd
-; SP: bl __aeabi_dadd
+; NONE: {{b|bl}} __aeabi_dadd
+; SP: {{b|bl}} __aeabi_dadd
 ; DP: vadd.f64 d0, d0, d1
   %0 = fadd double %a, %b
   ret double %0
@@ -25,7 +26,7 @@
 define float @sub_f(float %a, float %b) {
 entry:
 ; CHECK-LABEL: sub_f:
-; NONE: bl __aeabi_fsub
+; NONE: {{b|bl}} __aeabi_fsub
 ; HARD: vsub.f32 s
   %0 = fsub float %a, %b
   ret float %0
@@ -34,8 +35,8 @@
 define double @sub_d(double %a, double %b) {
 entry:
 ; CHECK-LABEL: sub_d:
-; NONE: bl __aeabi_dsub
-; SP: bl __aeabi_dsub
+; NONE: {{b|bl}} __aeabi_dsub
+; SP: {{b|bl}} __aeabi_dsub
 ; DP: vsub.f64 d0, d0, d1
   %0 = fsub double %a, %b
   ret double %0
@@ -44,7 +45,7 @@
 define float @mul_f(float %a, float %b) {
 entry:
 ; CHECK-LABEL: mul_f:
-; NONE: bl __aeabi_fmul
+; NONE: {{b|bl}} __aeabi_fmul
 ; HARD: vmul.f32 s
   %0 = fmul float %a, %b
   ret float %0
@@ -53,8 +54,8 @@
 define double @mul_d(double %a, double %b) {
 entry:
 ; CHECK-LABEL: mul_d:
-; NONE: bl __aeabi_dmul
-; SP: bl __aeabi_dmul
+; NONE: {{b|bl}} __aeabi_dmul
+; SP: {{b|bl}} __aeabi_dmul
 ; DP: vmul.f64 d0, d0, d1
   %0 = fmul double %a, %b
   ret double %0
@@ -63,7 +64,7 @@
 define float @div_f(float %a, float %b) {
 entry:
 ; CHECK-LABEL: div_f:
-; NONE: bl __aeabi_fdiv
+; NONE: {{b|bl}} __aeabi_fdiv
 ; HARD: vdiv.f32 s
   %0 = fdiv float %a, %b
   ret float %0
@@ -72,8 +73,8 @@
 define double @div_d(double %a, double %b) {
 entry:
 ; CHECK-LABEL: div_d:
-; NONE: bl __aeabi_ddiv
-; SP: bl __aeabi_ddiv
+; NONE: {{b|bl}} __aeabi_ddiv
+; SP: {{b|bl}} __aeabi_ddiv
 ; DP: vdiv.f64 d0, d0, d1
   %0 = fdiv double %a, %b
   ret double %0
@@ -109,7 +110,8 @@
 define double @load_d(double* %a) {
 entry:
 ; CHECK-LABEL: load_d:
-; NONE: ldm r0, {r0, r1}
+; NOREGS: ldm r0, {r0, r1}
+; ONLYREGS: vldr d0, [r0]
 ; HARD: vldr d0, [r0]
   %0 = load double, double* %a, align 8
   ret double %0
@@ -127,7 +129,8 @@
 define void @store_d(double* %a, double %b) {
 entry:
 ; CHECK-LABEL: store_d:
-; NONE: strd r2, r3, [r0]
+; NOREGS: strd r2, r3, [r0]
+; ONLYREGS: vstr d0, [r0]
 ; HARD: vstr d0, [r0]
   store double %b, double* %a, align 8
   ret void
@@ -259,8 +262,10 @@
 
 define float @select_f(float %a, float %b, i1 %c) {
 ; CHECK-LABEL: select_f:
-; NONE: lsls r2, r2, #31
-; NONE: moveq r0, r1
+; NOREGS: lsls r2, r2, #31
+; NOREGS: moveq r0, r1
+; ONLYREGS: lsls r2, r2, #31
+; ONLYREGS: vmovne.f32 s2, s0
 ; HARD: lsls r0, r0, #31
 ; VFP4-ALL: vmovne.f32 s1, s0
 ; VFP4-ALL: vmov.f32 s0, s1
@@ -273,8 +278,8 @@
 ; CHECK-LABEL: select_d:
 ; NONE: ldr{{(.w)?}} [[REG:r[0-9]+]], [sp]
 ; NONE ands [[REG]], [[REG]], #1
-; NONE: moveq r0, r2
-; NONE: moveq r1, r3
+; NONE-DAG: moveq r0, r2
+; NONE-DAG: moveq r1, r3
 ; SP: ands r0, r0, #1
 ; SP-DAG: vmov [[ALO:r[0-9]+]], [[AHI:r[0-9]+]], d0
 ; SP-DAG: vmov [[BLO:r[0-9]+]], [[BHI:r[0-9]+]], d1
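
For illustration only (not part of the patch): the standalone IR below is a minimal sketch of the behaviour the new ONLYREGS checks describe, assuming the same thumbv8.1m.main MVE hard-float configuration added in the RUN lines above. The function names are made up for this sketch, and the expectations simply mirror the existing load_d and add_d checks in float-ops.ll: f64 loads and stores stay legal and can use a D register, while f64 arithmetic is still softened to an AEABI libcall.

; Sketch, not a committed test. Expectations mirror load_d/add_d above.
; RUN: llc < %s -mtriple=thumbv8.1m.main-none-eabihf -mattr=+mve | FileCheck %s

; The f64 value can live in a D register even without double-precision FP...
define double @sketch_load_d(double* %p) {
entry:
; CHECK-LABEL: sketch_load_d:
; CHECK: vldr d0, [r0]
  %v = load double, double* %p, align 8
  ret double %v
}

; ...while the arithmetic itself is still turned into a runtime call.
define double @sketch_add_d(double %a, double %b) {
entry:
; CHECK-LABEL: sketch_add_d:
; CHECK: {{b|bl}} __aeabi_dadd
  %0 = fadd double %a, %b
  ret double %0
}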