diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -3125,20 +3125,6 @@
   return R;
 }
 
-// Emit code to clear the padding bits when returning or passing as an argument
-// a 16-bit floating-point value.
-llvm::Value *CodeGenFunction::EmitCMSEClearFP16(llvm::Value *Src) {
-  llvm::Type *RetTy = Src->getType();
-  assert(RetTy->isFloatTy() ||
-         (RetTy->isIntegerTy() && RetTy->getIntegerBitWidth() == 32));
-  if (RetTy->isFloatTy()) {
-    llvm::Value *T0 = Builder.CreateBitCast(Src, Builder.getIntNTy(32));
-    llvm::Value *T1 = Builder.CreateAnd(T0, 0xffff, "cmse.clear");
-    return Builder.CreateBitCast(T1, RetTy);
-  }
-  return Builder.CreateAnd(Src, 0xffff, "cmse.clear");
-}
-
 void CodeGenFunction::EmitFunctionEpilog(const CGFunctionInfo &FI,
                                          bool EmitRetDbgLoc,
                                          SourceLocation EndLoc) {
@@ -3308,17 +3294,10 @@
     if (CurFuncDecl && CurFuncDecl->hasAttr<CmseNSEntryAttr>()) {
       // For certain return types, clear padding bits, as they may reveal
       // sensitive information.
-      const Type *RTy = RetTy.getCanonicalType().getTypePtr();
-      if (RTy->isFloat16Type() || RTy->isHalfType()) {
-        // 16-bit floating-point types are passed in a 32-bit integer or float,
-        // with unspecified upper bits.
-        RV = EmitCMSEClearFP16(RV);
-      } else {
-        // Small struct/union types are passed as integers.
-        auto *ITy = dyn_cast<llvm::IntegerType>(RV->getType());
-        if (ITy != nullptr && isa<RecordType>(RetTy.getCanonicalType()))
-          RV = EmitCMSEClearRecord(RV, ITy, RetTy);
-      }
+      // Small struct/union types are passed as integers.
+      auto *ITy = dyn_cast<llvm::IntegerType>(RV->getType());
+      if (ITy != nullptr && isa<RecordType>(RetTy.getCanonicalType()))
+        RV = EmitCMSEClearRecord(RV, ITy, RetTy);
     }
     EmitReturnValueCheck(RV);
     Ret = Builder.CreateRet(RV);
@@ -4621,17 +4600,10 @@
         if (CallInfo.isCmseNSCall()) {
           // For certain parameter types, clear padding bits, as they may reveal
           // sensitive information.
-          const Type *PTy = I->Ty.getCanonicalType().getTypePtr();
-          // 16-bit floating-point types are passed in a 32-bit integer or
-          // float, with unspecified upper bits.
-          if (PTy->isFloat16Type() || PTy->isHalfType()) {
-            Load = EmitCMSEClearFP16(Load);
-          } else {
-            // Small struct/union types are passed as integer arrays.
-            auto *ATy = dyn_cast<llvm::ArrayType>(Load->getType());
-            if (ATy != nullptr && isa<RecordType>(I->Ty.getCanonicalType()))
-              Load = EmitCMSEClearRecord(Load, ATy, I->Ty);
-          }
+          // Small struct/union types are passed as integer arrays.
+          auto *ATy = dyn_cast<llvm::ArrayType>(Load->getType());
+          if (ATy != nullptr && isa<RecordType>(I->Ty.getCanonicalType()))
+            Load = EmitCMSEClearRecord(Load, ATy, I->Ty);
         }
         IRCallArgs[FirstIRArg] = Load;
       }
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3878,7 +3878,6 @@
                                    QualType RTy);
   llvm::Value *EmitCMSEClearRecord(llvm::Value *V, llvm::ArrayType *ATy,
                                    QualType RTy);
-  llvm::Value *EmitCMSEClearFP16(llvm::Value *V);
 
   llvm::Value *EmitCommonNeonBuiltinExpr(unsigned BuiltinID,
                                          unsigned LLVMIntrinsic,
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -6254,17 +6254,6 @@
   if (isIllegalVectorType(Ty))
     return coerceIllegalVector(Ty);
 
-  // _Float16 and __fp16 get passed as if it were an int or float, but with
-  // the top 16 bits unspecified. This is not done for OpenCL as it handles the
-  // half type natively, and does not need to interwork with AAPCS code.
-  if ((Ty->isFloat16Type() || Ty->isHalfType()) &&
-      !getContext().getLangOpts().NativeHalfArgsAndReturns) {
-    llvm::Type *ResType = IsAAPCS_VFP ?
-      llvm::Type::getFloatTy(getVMContext()) :
-      llvm::Type::getInt32Ty(getVMContext());
-    return ABIArgInfo::getDirect(ResType);
-  }
-
   if (!isAggregateTypeForABI(Ty)) {
     // Treat an enum type as its underlying type.
     if (const EnumType *EnumTy = Ty->getAs<EnumType>()) {
@@ -6465,17 +6454,6 @@
       return coerceIllegalVector(RetTy);
   }
 
-  // _Float16 and __fp16 get returned as if it were an int or float, but with
-  // the top 16 bits unspecified. This is not done for OpenCL as it handles the
-  // half type natively, and does not need to interwork with AAPCS code.
-  if ((RetTy->isFloat16Type() || RetTy->isHalfType()) &&
-      !getContext().getLangOpts().NativeHalfArgsAndReturns) {
-    llvm::Type *ResType = IsAAPCS_VFP ?
-      llvm::Type::getFloatTy(getVMContext()) :
-      llvm::Type::getInt32Ty(getVMContext());
-    return ABIArgInfo::getDirect(ResType);
-  }
-
   if (!isAggregateTypeForABI(RetTy)) {
     // Treat an enum type as its underlying type.
     if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
diff --git a/clang/test/CodeGen/arm-fp16-arguments.c b/clang/test/CodeGen/arm-fp16-arguments.c
--- a/clang/test/CodeGen/arm-fp16-arguments.c
+++ b/clang/test/CodeGen/arm-fp16-arguments.c
@@ -1,51 +1,33 @@
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fallow-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi hard -fallow-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
-// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=NATIVE
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=NATIVE
 
 __fp16 g;
 
 void t1(__fp16 a) { g = a; }
-// SOFT: define void @t1(i32 [[PARAM:%.*]])
-// SOFT: [[TRUNC:%.*]] = trunc i32 [[PARAM]] to i16
-// HARD: define arm_aapcs_vfpcc void @t1(float [[PARAM:%.*]])
-// HARD: [[BITCAST:%.*]] = bitcast float [[PARAM]] to i32
-// HARD: [[TRUNC:%.*]] = trunc i32 [[BITCAST]] to i16
-// CHECK: store i16 [[TRUNC]], i16* bitcast (half* @g to i16*)
+// SOFT: define void @t1(half [[PARAM:%.*]])
+// HARD: define arm_aapcs_vfpcc void @t1(half [[PARAM:%.*]])
 // NATIVE: define void @t1(half [[PARAM:%.*]])
-// NATIVE: store half [[PARAM]], half* @g
+// CHECK: store half [[PARAM]], half* @g
 
 __fp16 t2() { return g; }
-// SOFT: define i32 @t2()
-// HARD: define arm_aapcs_vfpcc float @t2()
+// SOFT: define half @t2()
+// HARD: define arm_aapcs_vfpcc half @t2()
 // NATIVE: define half @t2()
-// CHECK: [[LOAD:%.*]] = load i16, i16* bitcast (half* @g to i16*)
-// CHECK: [[ZEXT:%.*]] = zext i16 [[LOAD]] to i32
-// SOFT: ret i32 [[ZEXT]]
-// HARD: [[BITCAST:%.*]] = bitcast i32 [[ZEXT]] to float
-// HARD: ret float [[BITCAST]]
-// NATIVE: [[LOAD:%.*]] = load half, half* @g
-// NATIVE: ret half [[LOAD]]
+// CHECK: [[LOAD:%.*]] = load half, half* @g
+// CHECK: ret half [[LOAD]]
 
 _Float16 h;
 
 void t3(_Float16 a) { h = a; }
-// SOFT: define void @t3(i32 [[PARAM:%.*]])
-// SOFT: [[TRUNC:%.*]] = trunc i32 [[PARAM]] to i16
-// HARD: define arm_aapcs_vfpcc void @t3(float [[PARAM:%.*]])
-// HARD: [[BITCAST:%.*]] = bitcast float [[PARAM]] to i32
-// HARD: [[TRUNC:%.*]] = trunc i32 [[BITCAST]] to i16
-// CHECK: store i16 [[TRUNC]], i16* bitcast (half* @h to i16*)
+// SOFT: define void @t3(half [[PARAM:%.*]])
+// HARD: define arm_aapcs_vfpcc void @t3(half [[PARAM:%.*]])
 // NATIVE: define void @t3(half [[PARAM:%.*]])
-// NATIVE: store half [[PARAM]], half* @h
+// CHECK: store half [[PARAM]], half* @h
 
 _Float16 t4() { return h; }
-// SOFT: define i32 @t4()
-// HARD: define arm_aapcs_vfpcc float @t4()
+// SOFT: define half @t4()
+// HARD: define arm_aapcs_vfpcc half @t4()
 // NATIVE: define half @t4()
-// CHECK: [[LOAD:%.*]] = load i16, i16* bitcast (half* @h to i16*)
-// CHECK: [[ZEXT:%.*]] = zext i16 [[LOAD]] to i32
-// SOFT: ret i32 [[ZEXT]]
-// HARD: [[BITCAST:%.*]] = bitcast i32 [[ZEXT]] to float
-// HARD: ret float [[BITCAST]]
-// NATIVE: [[LOAD:%.*]] = load half, half* @h
-// NATIVE: ret half [[LOAD]]
+// CHECK: [[LOAD:%.*]] = load half, half* @h
+// CHECK: ret half [[LOAD]]
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/compare.c b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/compare.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
@@ -134,15 +134,12 @@
 
 // CHECK-LABEL: @test_vcmpeqq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT:    ret i16 [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
 //
 mve_pred16_t test_vcmpeqq_n_f16(float16x8_t a, float16_t b)
 {
@@ -433,18 +430,15 @@
 
 // CHECK-LABEL: @test_vcmpeqq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP4:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT:    ret i16 [[TMP7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
 //
 mve_pred16_t test_vcmpeqq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
@@ -732,15 +726,12 @@
 
 // CHECK-LABEL: @test_vcmpneq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp une <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT:    ret i16 [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp une <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
 //
 mve_pred16_t test_vcmpneq_n_f16(float16x8_t a, float16_t b)
 {
@@ -1031,18 +1022,15 @@
 
 // CHECK-LABEL: @test_vcmpneq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP4:%.*]] = fcmp une <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT:    ret i16 [[TMP7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp une <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
 //
 mve_pred16_t test_vcmpneq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
@@ -1330,15 +1318,12 @@
 
 // CHECK-LABEL: @test_vcmpgeq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oge <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT:    ret i16 [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp oge <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
 //
 mve_pred16_t test_vcmpgeq_n_f16(float16x8_t a, float16_t b)
 {
@@ -1629,18 +1614,15 @@
 
 // CHECK-LABEL: @test_vcmpgeq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP4:%.*]] = fcmp oge <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT:    ret i16 [[TMP7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oge <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
 //
 mve_pred16_t test_vcmpgeq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
@@ -1928,15 +1910,12 @@
 
 // CHECK-LABEL: @test_vcmpgtq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT:    ret i16 [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ogt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
 //
 mve_pred16_t test_vcmpgtq_n_f16(float16x8_t a, float16_t b)
 {
@@ -2227,18 +2206,15 @@
 
 // CHECK-LABEL: @test_vcmpgtq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP4:%.*]] = fcmp ogt <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT:    ret i16 [[TMP7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
 //
 mve_pred16_t test_vcmpgtq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
@@ -2478,15 +2454,12 @@
 
 // CHECK-LABEL: @test_vcmpleq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT:    ret i16 [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
 //
 mve_pred16_t test_vcmpleq_n_f16(float16x8_t a, float16_t b)
 {
@@ -2666,18 +2639,15 @@
 
 // CHECK-LABEL: @test_vcmpleq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP4:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT:    ret i16 [[TMP7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
 //
 mve_pred16_t test_vcmpleq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
@@ -2854,15 +2824,12 @@
 
 // CHECK-LABEL: @test_vcmpltq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT:    ret i16 [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
 //
 mve_pred16_t test_vcmpltq_n_f16(float16x8_t a, float16_t b)
 {
@@ -3042,18 +3009,15 @@
 
 // CHECK-LABEL: @test_vcmpltq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP4:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT:    ret i16 [[TMP7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
 //
 mve_pred16_t test_vcmpltq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp b/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp
--- a/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp
+++ b/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp
@@ -78,19 +78,12 @@
 
 // CHECK-LABEL: @_Z18test_vcmpeqq_n_f1619__simd128_float16_tDh(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[B:%.*]] = alloca half, align 2
-// CHECK-NEXT:    [[TMP:%.*]] = alloca float, align 4
-// CHECK-NEXT:    store float [[B_COERCE:%.*]], float* [[TMP]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[B]] to i8*
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP]] to i8*
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP0]], i8* align 4 [[TMP1]], i32 2, i1 false)
-// CHECK-NEXT:    [[B1:%.*]] = load half, half* [[B]], align 2
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT:    ret i16 [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
 //
 mve_pred16_t test_vcmpeqq_n_f16(float16x8_t a, float16_t b)
 {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/dup.c b/clang/test/CodeGen/arm-mve-intrinsics/dup.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/dup.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/dup.c
@@ -6,10 +6,7 @@
 
 // CHECK-LABEL: @test_vdupq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
 // CHECK-NEXT:    ret <8 x half> [[DOTSPLAT]]
 //
@@ -97,15 +94,12 @@
 
 // CHECK-LABEL: @test_vdupq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x half> [[DOTSPLAT]], <8 x half> [[INACTIVE:%.*]]
-// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[DOTSPLAT]], <8 x half> [[INACTIVE:%.*]]
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vdupq_m_n_f16(float16x8_t inactive, float16_t a, mve_pred16_t p)
 {
@@ -244,12 +238,9 @@
 
 // CHECK-LABEL: @test_vdupq_x_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
 // CHECK-NEXT:    ret <8 x half> [[DOTSPLAT]]
 //
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/get-set-lane.c b/clang/test/CodeGen/arm-mve-intrinsics/get-set-lane.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/get-set-lane.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/get-set-lane.c
@@ -7,10 +7,7 @@
 // CHECK-LABEL: @test_vgetq_lane_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x half> [[A:%.*]], i32 2
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP_0_INSERT_EXT:%.*]] = zext i16 [[TMP1]] to i32
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[TMP_0_INSERT_EXT]] to float
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    ret half [[TMP0]]
 //
 float16_t test_vgetq_lane_f16(float16x8_t a)
 {
@@ -149,11 +146,8 @@
 
 // CHECK-LABEL: @test_vsetq_lane_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[B:%.*]], half [[TMP1]], i32 4
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x half> [[B:%.*]], half [[A:%.*]], i32 4
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vsetq_lane_f16(float16_t a, float16x8_t b)
 {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/ternary.c b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
@@ -32,13 +32,10 @@
 
 // CHECK-LABEL: @test_vfmaq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[C:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]])
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]])
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
 #ifdef POLYMORPHIC
@@ -65,13 +62,10 @@
 
 // CHECK-LABEL: @test_vfmasq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[C:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]])
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vfmasq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
 #ifdef POLYMORPHIC
@@ -319,7 +313,7 @@
 // CHECK-LABEL: @test_vqdmlahq_n_s8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 //
 int8x16_t test_vqdmlahq_n_s8(int8x16_t a, int8x16_t b, int8_t c) {
@@ -333,7 +327,7 @@
 // CHECK-LABEL: @test_vqdmlahq_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP1]]
 //
 int16x8_t test_vqdmlahq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
@@ -346,7 +340,7 @@
 
 // CHECK-LABEL: @test_vqdmlahq_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 int32x4_t test_vqdmlahq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
@@ -401,7 +395,7 @@
 // CHECK-LABEL: @test_vqrdmlahq_n_s8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 //
 int8x16_t test_vqrdmlahq_n_s8(int8x16_t a, int8x16_t b, int8_t c) {
@@ -415,7 +409,7 @@
 // CHECK-LABEL: @test_vqrdmlahq_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP1]]
 //
 int16x8_t test_vqrdmlahq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
@@ -428,7 +422,7 @@
 
 // CHECK-LABEL: @test_vqrdmlahq_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 int32x4_t test_vqrdmlahq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
@@ -512,15 +506,12 @@
 
 // CHECK-LABEL: @test_vfmaq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[C:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vfmaq_m_n_f16(float16x8_t a, float16x8_t b, float16_t c, mve_pred16_t p) {
 #ifdef POLYMORPHIC
@@ -549,15 +540,12 @@
 
 // CHECK-LABEL: @test_vfmasq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[C:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]])
-// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vfmasq_m_n_f16(float16x8_t a, float16x8_t b, float16_t c, mve_pred16_t p) {
 #ifdef POLYMORPHIC
@@ -621,7 +609,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP3]]
 //
 int8x16_t test_vmlaq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) {
@@ -637,7 +625,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP3]]
 //
 int16x8_t test_vmlaq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) {
@@ -652,7 +640,7 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vmlaq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) {
@@ -668,7 +656,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP3]]
 //
 uint8x16_t test_vmlaq_m_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c, mve_pred16_t p) {
@@ -684,7 +672,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP3]]
 //
 uint16x8_t test_vmlaq_m_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c, mve_pred16_t p) {
@@ -699,7 +687,7 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vmlaq_m_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c, mve_pred16_t p) {
@@ -809,7 +797,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP3]]
 //
 int8x16_t test_vqdmlahq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) {
@@ -825,7 +813,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP3]]
 //
 int16x8_t test_vqdmlahq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) {
@@ -840,7 +828,7 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqdmlahq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) {
@@ -903,7 +891,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP3]]
 //
 int8x16_t test_vqrdmlahq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) {
@@ -919,7 +907,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP3]]
 //
 int16x8_t test_vqrdmlahq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) {
@@ -934,7 +922,7 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqrdmlahq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
@@ -114,13 +114,10 @@
 
 // CHECK-LABEL: @test_vaddq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fadd <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vaddq_n_f16(float16x8_t a, float16_t b)
 {
@@ -187,15 +184,12 @@
 
 // CHECK-LABEL: @test_vaddq_x_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef)
-// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vaddq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
@@ -264,17 +264,8 @@
 
 // CHECK-LABEL: @test_vminnmvq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.minnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float undef to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
-// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT:    ret float [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.arm.mve.minnmv.f16.v8f16(half [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    ret half [[TMP0]]
 //
 float16_t test_vminnmvq_f16(float16_t a, float16x8_t b) {
 #ifdef POLYMORPHIC
@@ -299,17 +290,8 @@
 
 // CHECK-LABEL: @test_vminnmavq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.minnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float undef to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
-// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT:    ret float [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.arm.mve.minnmav.f16.v8f16(half [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    ret half [[TMP0]]
 //
 float16_t test_vminnmavq_f16(float16_t a, float16x8_t b) {
 #ifdef POLYMORPHIC
@@ -334,17 +316,8 @@
 
 // CHECK-LABEL: @test_vmaxnmvq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float undef to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
-// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT:    ret float [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.arm.mve.maxnmv.f16.v8f16(half [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    ret half [[TMP0]]
 //
 float16_t test_vmaxnmvq_f16(float16_t a, float16x8_t b) {
 #ifdef POLYMORPHIC
@@ -369,17 +342,8 @@
 
 // CHECK-LABEL: @test_vmaxnmavq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float undef to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
-// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT:    ret float [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.arm.mve.maxnmav.f16.v8f16(half [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    ret half [[TMP0]]
 //
 float16_t test_vmaxnmavq_f16(float16_t a, float16x8_t b) {
 #ifdef POLYMORPHIC
@@ -698,19 +662,10 @@
 
 // CHECK-LABEL: @test_vminnmvq_p_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast float undef to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
-// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT:    ret float [[TMP7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret half [[TMP2]]
 //
 float16_t test_vminnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
 #ifdef POLYMORPHIC
@@ -737,19 +692,10 @@
 
 // CHECK-LABEL: @test_vminnmavq_p_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast float undef to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
-// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT:    ret float [[TMP7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret half [[TMP2]]
 //
 float16_t test_vminnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
 #ifdef POLYMORPHIC
@@ -776,19 +722,10 @@
 
 // CHECK-LABEL: @test_vmaxnmvq_p_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast float undef to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
-// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT:    ret float [[TMP7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret half [[TMP2]]
 //
 float16_t test_vmaxnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
 #ifdef POLYMORPHIC
@@ -815,19 +752,10 @@
 
 // CHECK-LABEL: @test_vmaxnmavq_p_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast float undef to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
-// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
-// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT:    ret float [[TMP7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret half [[TMP2]]
 //
 float16_t test_vmaxnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
 #ifdef POLYMORPHIC
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
@@ -308,15 +308,12 @@
 
 // CHECK-LABEL: @test_vmulq_m_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vmulq_m_n_f16(float16x8_t inactive, float16x8_t a, float16_t b, mve_pred16_t p)
 {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
@@ -114,13 +114,10 @@
 
 // CHECK-LABEL: @test_vsubq_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vsubq_n_f16(float16x8_t a, float16_t b)
 {
@@ -187,15 +184,12 @@
 
 // CHECK-LABEL: @test_vsubq_x_n_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef)
-// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vsubq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
diff --git a/clang/test/CodeGen/cmse-clear-fp16.c b/clang/test/CodeGen/cmse-clear-fp16.c
deleted file mode 100644
--- a/clang/test/CodeGen/cmse-clear-fp16.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: %clang_cc1 -triple thumbv8m.main -O0 -mcmse  -S -emit-llvm \
-// RUN:            -fallow-half-arguments-and-returns %s -o - | \
-// RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-NOPT-SOFT
-// RUN: %clang_cc1 -triple thumbv8m.main -O2 -mcmse  -S -emit-llvm \
-// RUN:            -fallow-half-arguments-and-returns %s -o - | \
-// RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-OPT-SOFT
-// RUN: %clang_cc1 -triple thumbv8m.main -O0 -mcmse  -S -emit-llvm \
-// RUN:            -fallow-half-arguments-and-returns -mfloat-abi hard %s -o - | \
-// RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-NOPT-HARD
-// RUN: %clang_cc1 -triple thumbv8m.main -O2 -mcmse  -S -emit-llvm \
-// RUN:            -fallow-half-arguments-and-returns -mfloat-abi hard %s -o - | \
-// RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-OPT-HARD
-
-__fp16 g0();
-__attribute__((cmse_nonsecure_entry)) __fp16 f0() {
-  return g0();
-}
-// CHECK:           define {{.*}}@f0()
-
-// CHECK-NOPT-SOFT: %[[V0:.*]] = load i32
-// CHECK-NOPT-SOFT: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-NOPT-SOFT: ret i32 %[[V1]]
-
-// CHECK-OPT-SOFT: %[[V0:.*]] = tail call {{.*}} @g0
-// CHECK-OPT-SOFT: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-OPT-SOFT: ret i32 %[[V1]]
-
-// CHECK-NOPT-HARD: %[[V0:.*]] = bitcast float {{.*}} to i32
-// CHECK-NOPT-HARD: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-NOPT-HARD: %[[V2:.*]] = bitcast i32 %[[V1]] to float
-// CHECK-NOPT-HARD: ret float %[[V2]]
-
-// CHECK-OPT-HARD: %[[V0:.*]] = bitcast float {{.*}} to i32
-// CHECK-OPT-HARD: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-OPT-HARD: %[[V2:.*]] = bitcast i32 %[[V1]] to float
-// CHECK-OPT-HARD: ret float %[[V2]]
-
-void __attribute__((cmse_nonsecure_call)) (*g1)(__fp16);
-__fp16 x;
-void f1() {
-  g1(x);
-}
-// CHECK: define {{.*}}@f1()
-
-// CHECK-NOPT-SOFT: %[[V0:.*]] = load i32
-// CHECK-NOPT-SOFT: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-NOPT-SOFT: call {{.*}} void {{.*}}(i32 %[[V1]])
-
-// CHECK-OPT-SOFT: %[[V1:.*]] = zext i16 {{.*}} to i32
-// CHECK-OPT-SOFT: call {{.*}} void {{.*}}(i32 %[[V1]])
-
-// CHECK-NOPT-HARD: %[[V0:.*]] = bitcast float {{.*}} to i32
-// CHECK-NOPT-HARD: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-NOPT-HARD: %[[V2:.*]] = bitcast i32 %[[V1]] to float
-// CHECK-NOPT-HARD: call {{.*}}(float %[[V2]])
-
-// CHECK-OPT-HARD: %[[V0:.*]] = zext i16 {{.*}} to i32
-// CHECK-OPT-HARD: %[[V1:.*]] = bitcast i32 %[[V0]] to float
-// CHECK-OPT-HARD: call {{.*}}(float %[[V1]])