diff --git a/clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c --- a/clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c +++ b/clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c @@ -62,43 +62,43 @@ return vld1_bf16_x2(ptr); } // CHECK-LABEL: test_vld1_bf16_x2 -// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr) +// CHECK64: %vld1xN = call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr) bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) { return vld1q_bf16_x2(ptr); } // CHECK-LABEL: test_vld1q_bf16_x2 -// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr) +// CHECK64: %vld1xN = call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr) bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) { return vld1_bf16_x3(ptr); } // CHECK-LABEL: test_vld1_bf16_x3 -// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr) +// CHECK64: %vld1xN = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr) bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) { return vld1q_bf16_x3(ptr); } // CHECK-LABEL: test_vld1q_bf16_x3 -// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr) +// CHECK64: %vld1xN = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr) bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) { return vld1_bf16_x4(ptr); } // CHECK-LABEL: test_vld1_bf16_x4 -// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr) +// CHECK64: %vld1xN = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr) bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) { return vld1q_bf16_x4(ptr); } // CHECK-LABEL: test_vld1q_bf16_x4 -// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr) -// 
CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr) +// CHECK64: %vld1xN = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr) bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) { return vld1q_dup_bf16(ptr); @@ -118,139 +118,139 @@ } // CHECK-LABEL: test_vld2_bf16 // CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>* -// CHECK64-NEXT: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0) +// CHECK64-NEXT: %vld2 = call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0) // CHECK32: %0 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2) +// CHECK32-NEXT: %vld2_v = call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2) bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) { return vld2q_bf16(ptr); } // CHECK-LABEL: test_vld2q_bf16 // CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>* -// CHECK64-NEXT: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0) +// CHECK64-NEXT: %vld2 = call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0) // CHECK32: %0 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2) +// CHECK32-NEXT: %vld2q_v = call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2) bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) { return vld2_lane_bf16(ptr, src, 1); } // CHECK-LABEL: test_vld2_lane_bf16 -// CHECK64: %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0) -// CHECK32: %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2) +// CHECK64: %vld2_lane = call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0) +// CHECK32: %vld2_lane_v = call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2) bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) { return vld2q_lane_bf16(ptr, src, 7); } // CHECK-LABEL: test_vld2q_lane_bf16 -// CHECK64: %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0) -// CHECK32: %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2) +// CHECK64: %vld2_lane = call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0) +// CHECK32: %vld2q_lane_v = call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2) 
bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) { return vld3_bf16(ptr); } // CHECK-LABEL: test_vld3_bf16 -// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0) +// CHECK64: %vld3 = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0) // CHECK32: %0 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2) +// CHECK32-NEXT: %vld3_v = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2) bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) { return vld3q_bf16(ptr); } // CHECK-LABEL: test_vld3q_bf16 -// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0) +// CHECK64: %vld3 = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0) // CHECK32: %0 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2) +// CHECK32-NEXT: %vld3q_v = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2) bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) { return vld3_lane_bf16(ptr, src, 1); } // CHECK-LABEL: test_vld3_lane_bf16 -// CHECK64: %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0) +// CHECK64: %vld3_lane = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0) // CHECK32: %3 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2) +// CHECK32-NEXT: %vld3_lane_v = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2) bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) { return vld3q_lane_bf16(ptr, src, 7); // return vld3q_lane_bf16(ptr, src, 8); } // CHECK-LABEL: test_vld3q_lane_bf16 -// CHECK64: %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0) +// CHECK64: %vld3_lane = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0) // CHECK32: %3 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2) +// CHECK32-NEXT: %vld3q_lane_v = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x 
bfloat> %2, i32 7, i32 2) bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) { return vld4_bf16(ptr); } // CHECK-LABEL: test_vld4_bf16 -// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0) +// CHECK64: %vld4 = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0) // CHECK32: %0 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2) +// CHECK32-NEXT: %vld4_v = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2) bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) { return vld4q_bf16(ptr); } // CHECK-LABEL: test_vld4q_bf16 -// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0) +// CHECK64: %vld4 = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0) // CHECK32: %0 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2) +// CHECK32-NEXT: %vld4q_v = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2) bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) { return vld4_lane_bf16(ptr, src, 1); } // CHECK-LABEL: test_vld4_lane_bf16 -// CHECK64: %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0) +// CHECK64: %vld4_lane = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0) // CHECK32: %4 = bitcast bfloat* %ptr to i8* -// CHECK32-NEXT: %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2) +// CHECK32-NEXT: %vld4_lane_v = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2) bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) { return vld4q_lane_bf16(ptr, src, 7); } // CHECK-LABEL: test_vld4q_lane_bf16 -// CHECK64: %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0) +// CHECK64: %vld4_lane = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0) // CHECK32: %4 = bitcast bfloat* 
%ptr to i8* -// CHECK32-NEXT: %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2) +// CHECK32-NEXT: %vld4q_lane_v = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2) bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) { return vld2_dup_bf16(ptr); } // CHECK-LABEL: test_vld2_dup_bf16 -// CHECK64: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2) +// CHECK64: %vld2 = call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld2_dup_v = call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2) bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) { return vld2q_dup_bf16(ptr); } // CHECK-LABEL: test_vld2q_dup_bf16 -// CHECK64: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2) +// CHECK64: %vld2 = call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld2q_dup_v = call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2) bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) { return vld3_dup_bf16(ptr); } // CHECK-LABEL: test_vld3_dup_bf16 -// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2) +// CHECK64: %vld3 = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld3_dup_v = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2) bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) { return vld3q_dup_bf16(ptr); } // CHECK-LABEL: test_vld3q_dup_bf16 -// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2) +// CHECK64: %vld3 = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld3q_dup_v = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2) bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) { return vld4_dup_bf16(ptr); } // CHECK-LABEL: test_vld4_dup_bf16 -// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2) +// CHECK64: %vld4 = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld4_dup_v = call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } 
@llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2) bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) { return vld4q_dup_bf16(ptr); } // CHECK-LABEL: test_vld4q_dup_bf16 -// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr) -// CHECK32: %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2) +// CHECK64: %vld4 = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld4q_dup_v = call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2) void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) { vst1_bf16(ptr, val); diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c @@ -61,7 +61,7 @@ // CHECK-LABEL: @test_vmaxaq_m_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.vmaxa.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // @@ -77,7 +77,7 @@ // CHECK-LABEL: @test_vmaxaq_m_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.vmaxa.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // @@ -93,7 +93,7 @@ // CHECK-LABEL: @test_vmaxaq_m_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.vmaxa.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c @@ -6,8 +6,8 @@ // CHECK-LABEL: @test_vmaxnmaq_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) // CHECK-NEXT: ret <8 x half> [[TMP2]] // @@ -22,8 +22,8 @@ // CHECK-LABEL: @test_vmaxnmaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A:%.*]]) -// CHECK-NEXT: 
[[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // @@ -39,7 +39,7 @@ // CHECK-LABEL: @test_vmaxnmaq_m_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret <8 x half> [[TMP2]] // @@ -55,7 +55,7 @@ // CHECK-LABEL: @test_vmaxnmaq_m_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c @@ -35,7 +35,7 @@ // CHECK-LABEL: @test_vmaxnmq_m_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x half> [[TMP2]] // @@ -51,7 +51,7 @@ // CHECK-LABEL: @test_vmaxnmq_m_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // @@ -67,7 +67,7 @@ // CHECK-LABEL: @test_vmaxnmq_x_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef) // CHECK-NEXT: ret <8 x half> [[TMP2]] // @@ -83,7 +83,7 @@ // CHECK-LABEL: @test_vmaxnmq_x_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x 
i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> undef) // CHECK-NEXT: ret <4 x float> [[TMP2]] // diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c @@ -52,7 +52,7 @@ // CHECK-LABEL: @test_vmaxq_m_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // @@ -68,7 +68,7 @@ // CHECK-LABEL: @test_vmaxq_m_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // @@ -84,7 +84,7 @@ // CHECK-LABEL: @test_vmaxq_m_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -100,7 +100,7 @@ // CHECK-LABEL: @test_vmaxq_x_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // @@ -116,7 +116,7 @@ // CHECK-LABEL: @test_vmaxq_x_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // @@ -132,7 +132,7 @@ // CHECK-LABEL: @test_vmaxq_x_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> 
@llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminaq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vminaq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminaq.c @@ -61,7 +61,7 @@ // CHECK-LABEL: @test_vminaq_m_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.vmina.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // @@ -77,7 +77,7 @@ // CHECK-LABEL: @test_vminaq_m_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.vmina.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // @@ -93,7 +93,7 @@ // CHECK-LABEL: @test_vminaq_m_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.vmina.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c @@ -6,8 +6,8 @@ // CHECK-LABEL: @test_vminnmaq_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) // CHECK-NEXT: ret <8 x half> [[TMP2]] // @@ -22,8 +22,8 @@ // CHECK-LABEL: @test_vminnmaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // @@ -39,7 +39,7 @@ // CHECK-LABEL: @test_vminnmaq_m_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: 
[[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret <8 x half> [[TMP2]] // @@ -55,7 +55,7 @@ // CHECK-LABEL: @test_vminnmaq_m_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c @@ -35,7 +35,7 @@ // CHECK-LABEL: @test_vminnmq_m_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x half> [[TMP2]] // @@ -51,7 +51,7 @@ // CHECK-LABEL: @test_vminnmq_m_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // @@ -67,7 +67,7 @@ // CHECK-LABEL: @test_vminnmq_x_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef) // CHECK-NEXT: ret <8 x half> [[TMP2]] // @@ -83,7 +83,7 @@ // CHECK-LABEL: @test_vminnmq_x_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> undef) // CHECK-NEXT: ret <4 x float> [[TMP2]] // diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c @@ -52,7 +52,7 @@ // CHECK-LABEL: @test_vminq_m_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call 
<16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // @@ -68,7 +68,7 @@ // CHECK-LABEL: @test_vminq_m_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // @@ -84,7 +84,7 @@ // CHECK-LABEL: @test_vminq_m_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -100,7 +100,7 @@ // CHECK-LABEL: @test_vminq_x_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // @@ -116,7 +116,7 @@ // CHECK-LABEL: @test_vminq_x_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // @@ -132,7 +132,7 @@ // CHECK-LABEL: @test_vminq_x_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // diff --git a/clang/test/CodeGen/arm64-mte.c b/clang/test/CodeGen/arm64-mte.c --- a/clang/test/CodeGen/arm64-mte.c +++ b/clang/test/CodeGen/arm64-mte.c @@ -47,7 +47,7 @@ unsigned exclude_tag(int *a, unsigned m) { // CHECK: [[T0:%[0-9]+]] = zext i32 %m to i64 // CHECK: [[T1:%[0-9]+]] = bitcast i32* %a to i8* -// CHECK: [[T2:%[0-9]+]] = tail call i64 @llvm.aarch64.gmi(i8* [[T1]], i64 [[T0]]) +// CHECK: [[T2:%[0-9]+]] = call i64 @llvm.aarch64.gmi(i8* [[T1]], i64 [[T0]]) // CHECK: trunc i64 [[T2]] to i32 return 
__arm_mte_exclude_tag(a, m); } @@ -103,7 +103,7 @@ // Check arithmetic promotion on return type // CHECK-LABEL: define i32 @subtract_pointers4 int subtract_pointers4(void* a, void *b) { -// CHECK: [[T0:%[0-9]+]] = tail call i64 @llvm.aarch64.subp(i8* %a, i8* %b) +// CHECK: [[T0:%[0-9]+]] = call i64 @llvm.aarch64.subp(i8* %a, i8* %b) // CHECK-NEXT: %cmp = icmp slt i64 [[T0]], 1 // CHECK-NEXT: = zext i1 %cmp to i32 return __arm_mte_ptrdiff(a,b) <= 0; diff --git a/clang/test/CodeGen/builtins-multiprecision.c b/clang/test/CodeGen/builtins-multiprecision.c --- a/clang/test/CodeGen/builtins-multiprecision.c +++ b/clang/test/CodeGen/builtins-multiprecision.c @@ -5,10 +5,10 @@ unsigned char test_addcb(unsigned char x, unsigned char y, unsigned char carryin, unsigned char *z) { // CHECK: @test_addcb - // CHECK: %{{.+}} = {{.*}} call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 %x, i8 %y) + // CHECK: %{{.+}} = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 %x, i8 %y) // CHECK: %{{.+}} = extractvalue { i8, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i8, i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 %{{.+}}, i8 %carryin) + // CHECK: %{{.+}} = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 %{{.+}}, i8 %carryin) // CHECK: %{{.+}} = extractvalue { i8, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i8, i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -24,10 +24,10 @@ unsigned short test_addcs(unsigned short x, unsigned short y, unsigned short carryin, unsigned short *z) { // CHECK: @test_addcs - // CHECK: %{{.+}} = {{.*}} call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %x, i16 %y) + // CHECK: %{{.+}} = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %x, i16 %y) // CHECK: %{{.+}} = extractvalue { i16, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i16, i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %{{.+}}, i16 %carryin) + // CHECK: %{{.+}} = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %{{.+}}, i16 %carryin) // CHECK: %{{.+}} = extractvalue { i16, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i16, i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -42,10 +42,10 @@ unsigned test_addc(unsigned x, unsigned y, unsigned carryin, unsigned *z) { // CHECK: @test_addc - // CHECK: %{{.+}} = {{.*}} call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) + // CHECK: %{{.+}} = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) // CHECK: %{{.+}} = extractvalue { i32, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i32, i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %{{.+}}, i32 %carryin) + // CHECK: %{{.+}} = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %{{.+}}, i32 %carryin) // CHECK: %{{.+}} = extractvalue { i32, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i32, i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -61,10 +61,10 @@ unsigned long carryin, unsigned long *z) { // long is i32 on i686, i64 on x86_64. 
// CHECK: @test_addcl([[UL:i32|i64]] %x - // CHECK: %{{.+}} = {{.*}} call { [[UL]], i1 } @llvm.uadd.with.overflow.[[UL]]([[UL]] %x, [[UL]] %y) + // CHECK: %{{.+}} = call { [[UL]], i1 } @llvm.uadd.with.overflow.[[UL]]([[UL]] %x, [[UL]] %y) // CHECK: %{{.+}} = extractvalue { [[UL]], i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { [[UL]], i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { [[UL]], i1 } @llvm.uadd.with.overflow.[[UL]]([[UL]] %{{.+}}, [[UL]] %carryin) + // CHECK: %{{.+}} = call { [[UL]], i1 } @llvm.uadd.with.overflow.[[UL]]([[UL]] %{{.+}}, [[UL]] %carryin) // CHECK: %{{.+}} = extractvalue { [[UL]], i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { [[UL]], i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -80,10 +80,10 @@ unsigned long long carryin, unsigned long long *z) { // CHECK: @test_addcll - // CHECK: %{{.+}} = {{.*}} call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x, i64 %y) + // CHECK: %{{.+}} = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x, i64 %y) // CHECK: %{{.+}} = extractvalue { i64, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i64, i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %{{.+}}, i64 %carryin) + // CHECK: %{{.+}} = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %{{.+}}, i64 %carryin) // CHECK: %{{.+}} = extractvalue { i64, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i64, i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -98,10 +98,10 @@ unsigned char test_subcb(unsigned char x, unsigned char y, unsigned char carryin, unsigned char *z) { // CHECK: @test_subcb - // CHECK: %{{.+}} = {{.*}} call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + // CHECK: %{{.+}} = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) // CHECK: %{{.+}} = extractvalue { i8, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i8, i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %{{.+}}, i8 %carryin) + // CHECK: %{{.+}} = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %{{.+}}, i8 %carryin) // CHECK: %{{.+}} = extractvalue { i8, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i8, i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -117,10 +117,10 @@ unsigned short test_subcs(unsigned short x, unsigned short y, unsigned short carryin, unsigned short *z) { // CHECK: @test_subcs - // CHECK: %{{.+}} = {{.*}} call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %x, i16 %y) + // CHECK: %{{.+}} = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %x, i16 %y) // CHECK: %{{.+}} = extractvalue { i16, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i16, i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %{{.+}}, i16 %carryin) + // CHECK: %{{.+}} = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %{{.+}}, i16 %carryin) // CHECK: %{{.+}} = extractvalue { i16, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i16, i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -135,10 +135,10 @@ unsigned test_subc(unsigned x, unsigned y, unsigned carryin, unsigned *z) { // CHECK: @test_subc - // CHECK: %{{.+}} = {{.*}} call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 %y) + // CHECK: %{{.+}} = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 %y) // CHECK: %{{.+}} = extractvalue { i32, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i32, i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %{{.+}}, i32 %carryin) + // CHECK: %{{.+}} 
= call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %{{.+}}, i32 %carryin) // CHECK: %{{.+}} = extractvalue { i32, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i32, i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -153,10 +153,10 @@ unsigned long test_subcl(unsigned long x, unsigned long y, unsigned long carryin, unsigned long *z) { // CHECK: @test_subcl([[UL:i32|i64]] %x - // CHECK: %{{.+}} = {{.*}} call { [[UL]], i1 } @llvm.usub.with.overflow.[[UL]]([[UL]] %x, [[UL]] %y) + // CHECK: %{{.+}} = call { [[UL]], i1 } @llvm.usub.with.overflow.[[UL]]([[UL]] %x, [[UL]] %y) // CHECK: %{{.+}} = extractvalue { [[UL]], i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { [[UL]], i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { [[UL]], i1 } @llvm.usub.with.overflow.[[UL]]([[UL]] %{{.+}}, [[UL]] %carryin) + // CHECK: %{{.+}} = call { [[UL]], i1 } @llvm.usub.with.overflow.[[UL]]([[UL]] %{{.+}}, [[UL]] %carryin) // CHECK: %{{.+}} = extractvalue { [[UL]], i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { [[UL]], i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} @@ -172,10 +172,10 @@ unsigned long long carryin, unsigned long long *z) { // CHECK: @test_subcll - // CHECK: %{{.+}} = {{.*}} call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y) + // CHECK: %{{.+}} = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y) // CHECK: %{{.+}} = extractvalue { i64, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i64, i1 } %{{.+}}, 0 - // CHECK: %{{.+}} = {{.*}} call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %{{.+}}, i64 %carryin) + // CHECK: %{{.+}} = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %{{.+}}, i64 %carryin) // CHECK: %{{.+}} = extractvalue { i64, i1 } %{{.+}}, 1 // CHECK: %{{.+}} = extractvalue { i64, i1 } %{{.+}}, 0 // CHECK: %{{.+}} = or i1 %{{.+}}, %{{.+}} diff --git a/clang/test/CodeGen/builtins-systemz-zvector-constrained.c b/clang/test/CodeGen/builtins-systemz-zvector-constrained.c --- a/clang/test/CodeGen/builtins-systemz-zvector-constrained.c +++ b/clang/test/CodeGen/builtins-systemz-zvector-constrained.c @@ -205,7 +205,7 @@ // CHECK-ASM: vflpdb vd = vec_nabs(vd); - // CHECK: [[ABS:%[^ ]+]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}}) + // CHECK: [[ABS:%[^ ]+]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}}) // CHECK-NEXT: fneg <2 x double> [[ABS]] // CHECK-ASM: vflndb @@ -225,7 +225,7 @@ // CHECK: call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float> [[VAL]], metadata !{{.*}}) // (emulated) vec_st2f(vd, ptrf); - // CHECK: [[VAL:%[^ ]+]] = tail call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double> %{{.*}}, metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double> %{{.*}}, metadata !{{.*}}) // CHECK: store <2 x float> [[VAL]], <2 x float>* %{{.*}} // (emulated) @@ -236,19 +236,19 @@ // CHECK: call <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) // (emulated) vd = vec_ctd(vsl, 1); - // CHECK: [[VAL:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) // CHECK: call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[VAL]], <2 x double> , metadata !{{.*}}) // (emulated) vd = vec_ctd(vul, 1); - // CHECK: [[VAL:%[^ ]+]] = tail call 
<2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) // CHECK: call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[VAL]], <2 x double> , metadata !{{.*}}) // (emulated) vd = vec_ctd(vsl, 31); - // CHECK: [[VAL:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) // CHECK: call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[VAL]], <2 x double> , metadata !{{.*}}) // (emulated) vd = vec_ctd(vul, 31); - // CHECK: [[VAL:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %{{.*}}, metadata !{{.*}}) // CHECK: call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[VAL]], <2 x double> , metadata !{{.*}}) // (emulated) @@ -259,19 +259,19 @@ // CHECK: call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double> %{{.*}}, metadata !{{.*}}) // (emulated) vsl = vec_ctsl(vd, 1); - // CHECK: [[VAL:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> {{.*}}, <2 x double> , metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> {{.*}}, <2 x double> , metadata !{{.*}}) // CHECK: call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double> [[VAL]], metadata !{{.*}}) // (emulated) vul = vec_ctul(vd, 1); - // CHECK: [[VAL:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %{{.*}}, <2 x double> , metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %{{.*}}, <2 x double> , metadata !{{.*}}) // CHECK: call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double> [[VAL]], metadata !{{.*}}) // (emulated) vsl = vec_ctsl(vd, 31); - // CHECK: [[VAL:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %{{.*}}, <2 x double> , metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %{{.*}}, <2 x double> , metadata !{{.*}}) // CHECK: call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double> [[VAL]], metadata !{{.*}}) // (emulated) vul = vec_ctul(vd, 31); - // CHECK: [[VAL:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %{{.*}}, <2 x double> , metadata !{{.*}}) + // CHECK: [[VAL:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %{{.*}}, <2 x double> , metadata !{{.*}}) // CHECK: call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double> [[VAL]], metadata !{{.*}}) // (emulated) diff --git a/clang/test/CodeGen/builtins-systemz-zvector.c b/clang/test/CodeGen/builtins-systemz-zvector.c --- a/clang/test/CodeGen/builtins-systemz-zvector.c +++ b/clang/test/CodeGen/builtins-systemz-zvector.c @@ -4441,7 +4441,7 @@ // CHECK-ASM: vflpdb vd = vec_nabs(vd); - // CHECK: [[ABS:%[^ ]+]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}}) + // CHECK: 
[[ABS:%[^ ]+]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}}) // CHECK-NEXT: fneg <2 x double> [[ABS]] // CHECK-ASM: vflndb diff --git a/clang/test/CodeGen/builtins-systemz-zvector2-constrained.c b/clang/test/CodeGen/builtins-systemz-zvector2-constrained.c --- a/clang/test/CodeGen/builtins-systemz-zvector2-constrained.c +++ b/clang/test/CodeGen/builtins-systemz-zvector2-constrained.c @@ -382,11 +382,11 @@ // CHECK-ASM: vflpdb vf = vec_nabs(vf); - // CHECK: [[ABS:%[^ ]+]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}}) + // CHECK: [[ABS:%[^ ]+]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}}) // CHECK-NEXT: fneg <4 x float> [[ABS]] // CHECK-ASM: vflnsb vd = vec_nabs(vd); - // CHECK: [[ABS:%[^ ]+]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}}) + // CHECK: [[ABS:%[^ ]+]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}}) // CHECK-NEXT: fneg <2 x double> [[ABS]] // CHECK-ASM: vflndb @@ -421,22 +421,22 @@ // CHECK-ASM: vfmsdb vf = vec_nmadd(vf, vf, vf); - // CHECK: [[RES:%[^ ]+]] = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, metadata !{{.*}}) + // CHECK: [[RES:%[^ ]+]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, metadata !{{.*}}) // CHECK: fneg <4 x float> [[RES]] // CHECK-ASM: vfnmasb vd = vec_nmadd(vd, vd, vd); - // CHECK: [[RES:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, metadata !{{.*}}) + // CHECK: [[RES:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, metadata !{{.*}}) // CHECK: fneg <2 x double> [[RES]] // CHECK-ASM: vfnmadb vf = vec_nmsub(vf, vf, vf); // CHECK: [[NEG:%[^ ]+]] = fneg <4 x float> %{{.*}} - // CHECK: [[RES:%[^ ]+]] = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]], metadata !{{.*}}) + // CHECK: [[RES:%[^ ]+]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]], metadata !{{.*}}) // CHECK: fneg <4 x float> [[RES]] // CHECK-ASM: vfnmssb vd = vec_nmsub(vd, vd, vd); // CHECK: [[NEG:%[^ ]+]] = fneg <2 x double> %{{.*}} - // CHECK: [[RES:%[^ ]+]] = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]], metadata !{{.*}}) + // CHECK: [[RES:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]], metadata !{{.*}}) // CHECK: fneg <2 x double> [[RES]] // CHECK-ASM: vfnmsdb diff --git a/clang/test/CodeGen/builtins-systemz-zvector2.c b/clang/test/CodeGen/builtins-systemz-zvector2.c --- a/clang/test/CodeGen/builtins-systemz-zvector2.c +++ b/clang/test/CodeGen/builtins-systemz-zvector2.c @@ -685,11 +685,11 @@ // CHECK-ASM: vflpdb vf = vec_nabs(vf); - // CHECK: [[ABS:%[^ ]+]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}}) + // CHECK: [[ABS:%[^ ]+]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}}) // CHECK-NEXT: fneg <4 x float> [[ABS]] // CHECK-ASM: vflnsb vd = vec_nabs(vd); - // CHECK: [[ABS:%[^ ]+]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}}) + // CHECK: [[ABS:%[^ ]+]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}}) // CHECK-NEXT: fneg <2 x 
double> [[ABS]] // CHECK-ASM: vflndb @@ -724,22 +724,22 @@ // CHECK-ASM: vfmsdb vf = vec_nmadd(vf, vf, vf); - // CHECK: [[RES:%[^ ]+]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: [[RES:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) // CHECK: fneg <4 x float> [[RES]] // CHECK-ASM: vfnmasb vd = vec_nmadd(vd, vd, vd); - // CHECK: [[RES:%[^ ]+]] = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: [[RES:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) // CHECK: fneg <2 x double> [[RES]] // CHECK-ASM: vfnmadb vf = vec_nmsub(vf, vf, vf); // CHECK: [[NEG:%[^ ]+]] = fneg <4 x float> %{{.*}} - // CHECK: [[RES:%[^ ]+]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]) + // CHECK: [[RES:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]) // CHECK: fneg <4 x float> [[RES]] // CHECK-ASM: vfnmssb vd = vec_nmsub(vd, vd, vd); // CHECK: [[NEG:%[^ ]+]] = fneg <2 x double> %{{.*}} - // CHECK: [[RES:%[^ ]+]] = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]) + // CHECK: [[RES:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]) // CHECK: fneg <2 x double> [[RES]] // CHECK-ASM: vfnmsdb diff --git a/clang/test/CodeGen/cmse-clear-fp16.c b/clang/test/CodeGen/cmse-clear-fp16.c --- a/clang/test/CodeGen/cmse-clear-fp16.c +++ b/clang/test/CodeGen/cmse-clear-fp16.c @@ -21,7 +21,7 @@ // CHECK-NOPT-SOFT: %[[V1:.*]] = and i32 %[[V0]], 65535 // CHECK-NOPT-SOFT: ret i32 %[[V1]] -// CHECK-OPT-SOFT: %[[V0:.*]] = tail call {{.*}} @g0 +// CHECK-OPT-SOFT: %[[V0:.*]] = call {{.*}} @g0 // CHECK-OPT-SOFT: %[[V1:.*]] = and i32 %[[V0]], 65535 // CHECK-OPT-SOFT: ret i32 %[[V1]] diff --git a/clang/test/CodeGen/inline-asm-x86-flag-output.c b/clang/test/CodeGen/inline-asm-x86-flag-output.c --- a/clang/test/CodeGen/inline-asm-x86-flag-output.c +++ b/clang/test/CodeGen/inline-asm-x86-flag-output.c @@ -2,7 +2,7 @@ int test_cca(long nr, volatile long *addr) { //CHECK-LABEL: @test_cca - //CHECK: = tail call i32 asm "cmp $2,$1", "={@cca},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@cca},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@cca"(x), "=m"(*(volatile long *)(addr)) @@ -15,7 +15,7 @@ int test_ccae(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccae - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccae},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccae},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccae"(x), "=m"(*(volatile long *)(addr)) @@ -28,7 +28,7 @@ int test_ccb(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccb - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccb},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccb},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccb"(x), "=m"(*(volatile long *)(addr)) @@ -41,7 +41,7 @@ int test_ccbe(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccbe - //CHECK: tail call i32 asm "cmp $2,$1", 
"={@ccbe},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: call i32 asm "cmp $2,$1", "={@ccbe},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccbe"(x), "=m"(*(volatile long *)(addr)) @@ -54,7 +54,7 @@ int test_ccc(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccc - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccc},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccc},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccc"(x), "=m"(*(volatile long *)(addr)) @@ -67,7 +67,7 @@ int test_cce(long nr, volatile long *addr) { //CHECK-LABEL: @test_cce - //CHECK: = tail call i32 asm "cmp $2,$1", "={@cce},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@cce},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@cce"(x), "=m"(*(volatile long *)(addr)) @@ -80,7 +80,7 @@ int test_ccz(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccz - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccz},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccz},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccz"(x), "=m"(*(volatile long *)(addr)) @@ -93,7 +93,7 @@ int test_ccg(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccg - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccg},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccg},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccg"(x), "=m"(*(volatile long *)(addr)) @@ -106,7 +106,7 @@ int test_ccge(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccge - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccge},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccge},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccge"(x), "=m"(*(volatile long *)(addr)) @@ -119,7 +119,7 @@ int test_ccl(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccl - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccl},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccl},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccl"(x), "=m"(*(volatile long *)(addr)) @@ -132,7 +132,7 @@ int test_ccle(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccle - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccle},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccle},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccle"(x), "=m"(*(volatile long *)(addr)) @@ -145,7 +145,7 @@ int test_ccna(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccna - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccna},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccna},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccna"(x), "=m"(*(volatile long *)(addr)) @@ -158,7 +158,7 @@ int test_ccnae(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnae - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnae},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 
asm "cmp $2,$1", "={@ccnae},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnae"(x), "=m"(*(volatile long *)(addr)) @@ -171,7 +171,7 @@ int test_ccnb(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnb - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnb},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccnb},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnb"(x), "=m"(*(volatile long *)(addr)) @@ -184,7 +184,7 @@ int test_ccnbe(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnbe - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnbe},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccnbe},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnbe"(x), "=m"(*(volatile long *)(addr)) @@ -197,7 +197,7 @@ int test_ccnc(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnc - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnc},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccnc},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnc"(x), "=m"(*(volatile long *)(addr)) @@ -210,7 +210,7 @@ int test_ccne(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccne - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccne},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccne},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccne"(x), "=m"(*(volatile long *)(addr)) @@ -223,7 +223,7 @@ int test_ccnz(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnz - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnz},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccnz},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnz"(x), "=m"(*(volatile long *)(addr)) @@ -236,7 +236,7 @@ int test_ccng(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccng - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccng},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccng},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccng"(x), "=m"(*(volatile long *)(addr)) @@ -249,7 +249,7 @@ int test_ccnge(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnge - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnge},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccnge},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnge"(x), "=m"(*(volatile long *)(addr)) @@ -262,7 +262,7 @@ int test_ccnl(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnl - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnl},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccnl},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnl"(x), "=m"(*(volatile long *)(addr)) @@ -275,7 +275,7 @@ int test_ccnle(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnle - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnle},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", 
"={@ccnle},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnle"(x), "=m"(*(volatile long *)(addr)) @@ -288,7 +288,7 @@ int test_ccno(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccno - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccno},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccno},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccno"(x), "=m"(*(volatile long *)(addr)) @@ -301,7 +301,7 @@ int test_ccnp(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccnp - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccnp},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccnp},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccnp"(x), "=m"(*(volatile long *)(addr)) @@ -314,7 +314,7 @@ int test_ccns(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccns - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccns},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccns},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccns"(x), "=m"(*(volatile long *)(addr)) @@ -327,7 +327,7 @@ int test_cco(long nr, volatile long *addr) { //CHECK-LABEL: @test_cco - //CHECK: = tail call i32 asm "cmp $2,$1", "={@cco},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@cco},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@cco"(x), "=m"(*(volatile long *)(addr)) @@ -340,7 +340,7 @@ int test_ccp(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccp - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccp},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccp},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccp"(x), "=m"(*(volatile long *)(addr)) @@ -353,7 +353,7 @@ int test_ccs(long nr, volatile long *addr) { //CHECK-LABEL: @test_ccs - //CHECK: = tail call i32 asm "cmp $2,$1", "={@ccs},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) + //CHECK: = call i32 asm "cmp $2,$1", "={@ccs},=*m,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64* %addr, i64 %nr) int x; asm("cmp %2,%1" : "=@ccs"(x), "=m"(*(volatile long *)(addr)) @@ -366,7 +366,7 @@ _Bool check_no_clobber_conflicts() { //CHECK-LABEL: @check_no_clobber_conflicts - //CHECK: = tail call i8 asm "", "={@cce},~{cx},~{dirflag},~{fpsr},~{flags}"() + //CHECK: = call i8 asm "", "={@cce},~{cx},~{dirflag},~{fpsr},~{flags}"() _Bool b; asm("" : "=@cce"(b) diff --git a/clang/test/CodeGen/ms-intrinsics-other.c b/clang/test/CodeGen/ms-intrinsics-other.c --- a/clang/test/CodeGen/ms-intrinsics-other.c +++ b/clang/test/CodeGen/ms-intrinsics-other.c @@ -31,7 +31,7 @@ // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[INDEX:%[0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %Mask, i1 true) +// CHECK: [[INDEX:%[0-9]+]] = call i32 @llvm.cttz.i32(i32 %Mask, i1 true) // CHECK: store i32 [[INDEX]], i32* %Index, align 4 // CHECK: br label %[[END_LABEL]] @@ -45,7 +45,7 @@ // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: 
[[REVINDEX:%[0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %Mask, i1 true) +// CHECK: [[REVINDEX:%[0-9]+]] = call i32 @llvm.ctlz.i32(i32 %Mask, i1 true) // CHECK: [[INDEX:%[0-9]+]] = xor i32 [[REVINDEX]], 31 // CHECK: store i32 [[INDEX]], i32* %Index, align 4 // CHECK: br label %[[END_LABEL]] @@ -61,7 +61,7 @@ // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[INDEX:%[0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %Mask, i1 true) +// CHECK: [[INDEX:%[0-9]+]] = call i64 @llvm.cttz.i64(i64 %Mask, i1 true) // CHECK: [[TRUNC_INDEX:%[0-9]+]] = trunc i64 [[INDEX]] to i32 // CHECK: store i32 [[TRUNC_INDEX]], i32* %Index, align 4 // CHECK: br label %[[END_LABEL]] @@ -76,7 +76,7 @@ // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[REVINDEX:%[0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %Mask, i1 true) +// CHECK: [[REVINDEX:%[0-9]+]] = call i64 @llvm.ctlz.i64(i64 %Mask, i1 true) // CHECK: [[TRUNC_REVINDEX:%[0-9]+]] = trunc i64 [[REVINDEX]] to i32 // CHECK: [[INDEX:%[0-9]+]] = xor i32 [[TRUNC_REVINDEX]], 63 // CHECK: store i32 [[INDEX]], i32* %Index, align 4 diff --git a/clang/test/CodeGen/ms-intrinsics.c b/clang/test/CodeGen/ms-intrinsics.c --- a/clang/test/CodeGen/ms-intrinsics.c +++ b/clang/test/CodeGen/ms-intrinsics.c @@ -156,7 +156,7 @@ // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[INDEX:%[0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %Mask, i1 true) +// CHECK: [[INDEX:%[0-9]+]] = call i32 @llvm.cttz.i32(i32 %Mask, i1 true) // CHECK: store i32 [[INDEX]], i32* %Index, align 4 // CHECK: br label %[[END_LABEL]] @@ -170,7 +170,7 @@ // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[REVINDEX:%[0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %Mask, i1 true) +// CHECK: [[REVINDEX:%[0-9]+]] = call i32 @llvm.ctlz.i32(i32 %Mask, i1 true) // CHECK: [[INDEX:%[0-9]+]] = xor i32 [[REVINDEX]], 31 // CHECK: store i32 [[INDEX]], i32* %Index, align 4 // CHECK: br label %[[END_LABEL]] @@ -186,7 +186,7 @@ // CHECK-ARM-X64: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK-ARM-X64: ret i8 [[RESULT]] // CHECK-ARM-X64: [[ISNOTZERO_LABEL]]: -// CHECK-ARM-X64: [[INDEX:%[0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %Mask, i1 true) +// CHECK-ARM-X64: [[INDEX:%[0-9]+]] = call i64 @llvm.cttz.i64(i64 %Mask, i1 true) // CHECK-ARM-X64: [[TRUNC_INDEX:%[0-9]+]] = trunc i64 [[INDEX]] to i32 // CHECK-ARM-X64: store i32 [[TRUNC_INDEX]], i32* %Index, align 4 // CHECK-ARM-X64: br label %[[END_LABEL]] @@ -201,7 +201,7 @@ // CHECK-ARM-X64: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK-ARM-X64: ret i8 [[RESULT]] // CHECK-ARM-X64: [[ISNOTZERO_LABEL]]: -// CHECK-ARM-X64: [[REVINDEX:%[0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %Mask, i1 true) +// CHECK-ARM-X64: [[REVINDEX:%[0-9]+]] = call i64 @llvm.ctlz.i64(i64 %Mask, i1 true) // CHECK-ARM-X64: [[TRUNC_REVINDEX:%[0-9]+]] = trunc i64 [[REVINDEX]] to i32 // CHECK-ARM-X64: [[INDEX:%[0-9]+]] = xor i32 [[TRUNC_REVINDEX]], 63 // CHECK-ARM-X64: store i32 [[INDEX]], i32* %Index, 
align 4 diff --git a/clang/test/CodeGen/systemz-inline-asm.c b/clang/test/CodeGen/systemz-inline-asm.c --- a/clang/test/CodeGen/systemz-inline-asm.c +++ b/clang/test/CodeGen/systemz-inline-asm.c @@ -126,6 +126,6 @@ // CHECK: define void @test_f128(fp128* noalias nocapture sret align 8 [[DEST:%.*]], fp128* nocapture readonly %0, fp128* nocapture readonly %1) // CHECK: %f = load fp128, fp128* %0 // CHECK: %g = load fp128, fp128* %1 -// CHECK: [[RESULT:%.*]] = tail call fp128 asm "axbr $0, $2", "=f,0,f"(fp128 %f, fp128 %g) +// CHECK: [[RESULT:%.*]] = call fp128 asm "axbr $0, $2", "=f,0,f"(fp128 %f, fp128 %g) // CHECK: store fp128 [[RESULT]], fp128* [[DEST]] } diff --git a/clang/test/CodeGenCXX/ARM/exception-alignment.cpp b/clang/test/CodeGenCXX/ARM/exception-alignment.cpp --- a/clang/test/CodeGenCXX/ARM/exception-alignment.cpp +++ b/clang/test/CodeGenCXX/ARM/exception-alignment.cpp @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -triple armv8-arm-none-eabi -emit-llvm -target-cpu generic -Os -fcxx-exceptions -o - -x c++ %s | FileCheck --check-prefixes=CHECK,A8 %s // RUN: %clang_cc1 -triple armv8-unknown-linux-android -emit-llvm -target-cpu generic -Os -fcxx-exceptions -o - -x c++ %s | FileCheck --check-prefixes=CHECK,A16 %s -// CHECK: [[E:%[A-z0-9]+]] = tail call i8* @__cxa_allocate_exception +// CHECK: [[E:%[A-z0-9]+]] = call i8* @__cxa_allocate_exception // CHECK-NEXT: [[BC:%[A-z0-9]+]] = bitcast i8* [[E]] to <2 x i64>* // A8-NEXT: store <2 x i64> , <2 x i64>* [[BC]], align 8 // A16-NEXT: store <2 x i64> , <2 x i64>* [[BC]], align 16 diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp --- a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp @@ -15,7 +15,7 @@ // CHECK-NEXT: br i1 [[isnull]], label %[[dynamic_cast_end:[a-z0-9._]+]], label %[[dynamic_cast_notnull:[a-z0-9._]+]] // CHECK: [[dynamic_cast_notnull]]: // CHECK-NEXT: [[a:%[0-9]+]] = bitcast %class.A* %a to i8* -// CHECK-NEXT: [[as_b:%[0-9]+]] = tail call i8* @__dynamic_cast(i8* nonnull [[a]], i8* bitcast ({ i8*, i8* }* @_ZTI1A to i8*), i8* bitcast ({ i8*, i8*, i8* }* @_ZTI1B to i8*), i64 0) +// CHECK-NEXT: [[as_b:%[0-9]+]] = call i8* @__dynamic_cast(i8* nonnull [[a]], i8* bitcast ({ i8*, i8* }* @_ZTI1A to i8*), i8* bitcast ({ i8*, i8*, i8* }* @_ZTI1B to i8*), i64 0) // CHECK-NEXT: [[b:%[0-9]+]] = bitcast i8* [[as_b]] to %class.B* // CHECK-NEXT: br label %[[dynamic_cast_end]] // CHECK: [[dynamic_cast_end]]: diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/member-function-pointer.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/member-function-pointer.cpp --- a/clang/test/CodeGenCXX/RelativeVTablesABI/member-function-pointer.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/member-function-pointer.cpp @@ -17,7 +17,7 @@ // CHECK-NEXT: [[this:%.+]] = bitcast i8* [[this_adj]] to i8** // CHECK-NEXT: [[vtable:%.+]] = load i8*, i8** [[this]], align 8 // CHECK-NEXT: [[offset:%.+]] = add i64 [[fn_ptr]], -1 -// CHECK-NEXT: [[ptr:%.+]] = tail call i8* @llvm.load.relative.i64(i8* [[vtable]], i64 [[offset]]) +// CHECK-NEXT: [[ptr:%.+]] = call i8* @llvm.load.relative.i64(i8* [[vtable]], i64 [[offset]]) // CHECK-NEXT: [[method:%.+]] = bitcast i8* [[ptr]] to void (%class.A*)* // CHECK-NEXT: br label %[[memptr_end:.+]] // CHECK: [[nonvirt]]: diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp --- 
a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp @@ -40,7 +40,7 @@ // CHECK: [[end]]: // CHECK-NEXT: [[type_info_ptr3:%[0-9]+]] = bitcast %class.A* %a to i8** // CHECK-NEXT: [[vtable:%[a-z0-9]+]] = load i8*, i8** [[type_info_ptr3]] -// CHECK-NEXT: [[type_info_ptr:%[0-9]+]] = tail call i8* @llvm.load.relative.i32(i8* [[vtable]], i32 -4) +// CHECK-NEXT: [[type_info_ptr:%[0-9]+]] = call i8* @llvm.load.relative.i32(i8* [[vtable]], i32 -4) // CHECK-NEXT: [[type_info_ptr2:%[0-9]+]] = bitcast i8* [[type_info_ptr]] to %"class.std::type_info"** // CHECK-NEXT: [[type_info_ptr:%[0-9]+]] = load %"class.std::type_info"*, %"class.std::type_info"** [[type_info_ptr2]], align 8 // CHECK-NEXT: [[name_ptr:%[a-z0-9._]+]] = getelementptr inbounds %"class.std::type_info", %"class.std::type_info"* [[type_info_ptr]], i64 0, i32 1 diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/virtual-function-call.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/virtual-function-call.cpp --- a/clang/test/CodeGenCXX/RelativeVTablesABI/virtual-function-call.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/virtual-function-call.cpp @@ -6,7 +6,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[this:%[0-9]+]] = bitcast %class.A* %a to i8** // CHECK-NEXT: %vtable1 = load i8*, i8** [[this]] -// CHECK-NEXT: [[func_ptr:%[0-9]+]] = tail call i8* @llvm.load.relative.i32(i8* %vtable1, i32 0) +// CHECK-NEXT: [[func_ptr:%[0-9]+]] = call i8* @llvm.load.relative.i32(i8* %vtable1, i32 0) // CHECK-NEXT: [[func:%[0-9]+]] = bitcast i8* [[func_ptr]] to void (%class.A*)* // CHECK-NEXT: tail call void [[func]](%class.A* %a) // CHECK-NEXT: ret void diff --git a/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp b/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp --- a/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp +++ b/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp @@ -15,15 +15,15 @@ // PR10789: different platforms have different sizes for struct UnwindException. 
-// X86-64: [[T0:%.*]] = tail call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] +// X86-64: [[T0:%.*]] = call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] // X86-64-NEXT: [[T1:%.*]] = getelementptr i8, i8* [[EXN]], i64 32 -// X86-32: [[T0:%.*]] = tail call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] +// X86-32: [[T0:%.*]] = call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] // X86-32-NEXT: [[T1:%.*]] = getelementptr i8, i8* [[EXN]], i64 32 -// ARM-DARWIN: [[T0:%.*]] = tail call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] +// ARM-DARWIN: [[T0:%.*]] = call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] // ARM-DARWIN-NEXT: [[T1:%.*]] = getelementptr i8, i8* [[EXN]], i64 32 -// ARM-EABI: [[T0:%.*]] = tail call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] +// ARM-EABI: [[T0:%.*]] = call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] // ARM-EABI-NEXT: [[T1:%.*]] = getelementptr i8, i8* [[EXN]], i32 88 -// MIPS: [[T0:%.*]] = tail call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] +// MIPS: [[T0:%.*]] = call i8* @__cxa_begin_catch(i8* [[EXN:%.*]]) [[NUW:#[0-9]+]] // MIPS-NEXT: [[T1:%.*]] = getelementptr i8, i8* [[EXN]], i32 24 // X86-64: attributes [[NUW]] = { nounwind } diff --git a/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm b/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm --- a/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm +++ b/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm @@ -26,14 +26,14 @@ extern "C" void single() { function({ [I new] }); } -// CHECK: [[INSTANCE:%.*]] = {{.*}} call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}}) +// CHECK: [[INSTANCE:%.*]] = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}}) // CHECK-NEXT: [[CAST:%.*]] = bitcast [{{[0-9]+}} x %0*]* %{{.*}} to i8** // CHECK-NEXT: store i8* [[INSTANCE]], i8** [[CAST]], // CHECK: call void @llvm.objc.release(i8* {{.*}}) extern "C" void multiple() { function({ [I new], [I new] }); } -// CHECK: [[INSTANCE:%.*]] = {{.*}} call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}}) +// CHECK: [[INSTANCE:%.*]] = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}}) // CHECK-NEXT: [[CAST:%.*]] = bitcast [{{[0-9]+}} x %0*]* %{{.*}} to i8** // CHECK-NEXT: store i8* [[INSTANCE]], i8** [[CAST]], // CHECK: call void @llvm.objc.release(i8* {{.*}}) @@ -56,13 +56,13 @@ external(); } -// CHECK: [[INSTANCE:%.*]] = {{.*}} call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}}) +// CHECK: [[INSTANCE:%.*]] = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}}) // CHECK: {{.*}} call void @_Z8externalv() // CHECK: {{.*}} call void @llvm.objc.release(i8* {{.*}}) std::initializer_list il = { [I new] }; -// CHECK: [[POOL:%.*]] = {{.*}} call i8* @llvm.objc.autoreleasePoolPush() -// CHECK: [[INSTANCE:%.*]] = {{.*}} call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}}) +// CHECK: [[POOL:%.*]] = call i8* @llvm.objc.autoreleasePoolPush() +// CHECK: [[INSTANCE:%.*]] = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}}) // CHECK-NEXT: store i8* [[INSTANCE]], i8** bitcast ([1 x %0*]* @_ZGR2il_ to i8**) // CHECK: {{.*}} call void @llvm.objc.autoreleasePoolPop(i8* [[POOL]]) diff --git 
a/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl b/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl --- a/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl @@ -4,10 +4,10 @@ // CHECK-LABEL: @use_flat_scratch_name kernel void use_flat_scratch_name() { -// CHECK: tail call void asm sideeffect "s_mov_b64 flat_scratch, 0", "~{flat_scratch}"() +// CHECK: call void asm sideeffect "s_mov_b64 flat_scratch, 0", "~{flat_scratch}"() __asm__ volatile("s_mov_b64 flat_scratch, 0" : : : "flat_scratch"); -// CHECK: tail call void asm sideeffect "s_mov_b32 flat_scratch_lo, 0", "~{flat_scratch_lo}"() +// CHECK: call void asm sideeffect "s_mov_b32 flat_scratch_lo, 0", "~{flat_scratch_lo}"() __asm__ volatile("s_mov_b32 flat_scratch_lo, 0" : : : "flat_scratch_lo"); // CHECK: tail call void asm sideeffect "s_mov_b32 flat_scratch_hi, 0", "~{flat_scratch_hi}"() diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -489,9 +489,9 @@ } // CHECK-LABEL: @test_get_group_id( -// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.x() -// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.y() -// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.z() +// CHECK: call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK: call i32 @llvm.amdgcn.workgroup.id.y() +// CHECK: call i32 @llvm.amdgcn.workgroup.id.z() void test_get_group_id(int d, global int *out) { switch (d) { @@ -503,9 +503,9 @@ } // CHECK-LABEL: @test_s_getreg( -// CHECK: tail call i32 @llvm.amdgcn.s.getreg(i32 0) -// CHECK: tail call i32 @llvm.amdgcn.s.getreg(i32 1) -// CHECK: tail call i32 @llvm.amdgcn.s.getreg(i32 65535) +// CHECK: call i32 @llvm.amdgcn.s.getreg(i32 0) +// CHECK: call i32 @llvm.amdgcn.s.getreg(i32 1) +// CHECK: call i32 @llvm.amdgcn.s.getreg(i32 65535) void test_s_getreg(volatile global uint *out) { *out = __builtin_amdgcn_s_getreg(0); @@ -514,9 +514,9 @@ } // CHECK-LABEL: @test_get_local_id( -// CHECK: tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[$WI_RANGE:![0-9]*]] -// CHECK: tail call i32 @llvm.amdgcn.workitem.id.y(), !range [[$WI_RANGE]] -// CHECK: tail call i32 @llvm.amdgcn.workitem.id.z(), !range [[$WI_RANGE]] +// CHECK: call i32 @llvm.amdgcn.workitem.id.x(), !range [[$WI_RANGE:![0-9]*]] +// CHECK: call i32 @llvm.amdgcn.workitem.id.y(), !range [[$WI_RANGE]] +// CHECK: call i32 @llvm.amdgcn.workitem.id.z(), !range [[$WI_RANGE]] void test_get_local_id(int d, global int *out) { switch (d) { @@ -614,67 +614,67 @@ } // CHECK-LABEL: @test_alignbit( -// CHECK: tail call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) +// CHECK: call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) kernel void test_alignbit(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_alignbit(src0, src1, src2); } // CHECK-LABEL: @test_alignbyte( -// CHECK: tail call i32 @llvm.amdgcn.alignbyte(i32 %src0, i32 %src1, i32 %src2) +// CHECK: call i32 @llvm.amdgcn.alignbyte(i32 %src0, i32 %src1, i32 %src2) kernel void test_alignbyte(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_alignbyte(src0, src1, src2); } // CHECK-LABEL: @test_ubfe( -// CHECK: tail call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src2) +// CHECK: call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src2) kernel void test_ubfe(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_ubfe(src0, src1, src2); 
} // CHECK-LABEL: @test_sbfe( -// CHECK: tail call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2) +// CHECK: call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2) kernel void test_sbfe(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_sbfe(src0, src1, src2); } // CHECK-LABEL: @test_cvt_pkrtz( -// CHECK: tail call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %src0, float %src1) +// CHECK: call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %src0, float %src1) kernel void test_cvt_pkrtz(global half2* out, float src0, float src1) { *out = __builtin_amdgcn_cvt_pkrtz(src0, src1); } // CHECK-LABEL: @test_cvt_pknorm_i16( -// CHECK: tail call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %src0, float %src1) +// CHECK: call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %src0, float %src1) kernel void test_cvt_pknorm_i16(global short2* out, float src0, float src1) { *out = __builtin_amdgcn_cvt_pknorm_i16(src0, src1); } // CHECK-LABEL: @test_cvt_pknorm_u16( -// CHECK: tail call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %src0, float %src1) +// CHECK: call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %src0, float %src1) kernel void test_cvt_pknorm_u16(global ushort2* out, float src0, float src1) { *out = __builtin_amdgcn_cvt_pknorm_u16(src0, src1); } // CHECK-LABEL: @test_cvt_pk_i16( -// CHECK: tail call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %src0, i32 %src1) +// CHECK: call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %src0, i32 %src1) kernel void test_cvt_pk_i16(global short2* out, int src0, int src1) { *out = __builtin_amdgcn_cvt_pk_i16(src0, src1); } // CHECK-LABEL: @test_cvt_pk_u16( -// CHECK: tail call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %src0, i32 %src1) +// CHECK: call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %src0, i32 %src1) kernel void test_cvt_pk_u16(global ushort2* out, uint src0, uint src1) { *out = __builtin_amdgcn_cvt_pk_u16(src0, src1); } // CHECK-LABEL: @test_cvt_pk_u8_f32 -// CHECK: tail call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src0, i32 %src1, i32 %src2) +// CHECK: call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src0, i32 %src1, i32 %src2) kernel void test_cvt_pk_u8_f32(global uint* out, float src0, uint src1, uint src2) { *out = __builtin_amdgcn_cvt_pk_u8_f32(src0, src1, src2); } // CHECK-LABEL: @test_sad_u8( -// CHECK: tail call i32 @llvm.amdgcn.sad.u8(i32 %src0, i32 %src1, i32 %src2) +// CHECK: call i32 @llvm.amdgcn.sad.u8(i32 %src0, i32 %src1, i32 %src2) kernel void test_sad_u8(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_sad_u8(src0, src1, src2); } diff --git a/clang/test/CodeGenOpenCL/builtins-generic-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-generic-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-generic-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-generic-amdgcn.cl @@ -2,20 +2,20 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s // CHECK-LABEL: @test_builtin_clz( -// CHECK: tail call i32 @llvm.ctlz.i32(i32 %a, i1 true) +// CHECK: call i32 @llvm.ctlz.i32(i32 %a, i1 true) void test_builtin_clz(global int* out, int a) { *out = __builtin_clz(a); } // CHECK-LABEL: @test_builtin_clzl( -// CHECK: tail call i64 @llvm.ctlz.i64(i64 %a, i1 true) +// CHECK: call i64 @llvm.ctlz.i64(i64 %a, i1 true) void test_builtin_clzl(global long* out, long a) { *out = __builtin_clzl(a); } -// CHECK: tail call i8 addrspace(5)* @llvm.frameaddress.p5i8(i32 0) +// CHECK: call i8 addrspace(5)* @llvm.frameaddress.p5i8(i32 0) void test_builtin_frame_address(int *out) { *out = __builtin_frame_address(0); 
} diff --git a/clang/test/CodeGenOpenCL/builtins-r600.cl b/clang/test/CodeGenOpenCL/builtins-r600.cl --- a/clang/test/CodeGenOpenCL/builtins-r600.cl +++ b/clang/test/CodeGenOpenCL/builtins-r600.cl @@ -25,9 +25,9 @@ } // CHECK-LABEL: @test_get_group_id( -// CHECK: tail call i32 @llvm.r600.read.tgid.x() -// CHECK: tail call i32 @llvm.r600.read.tgid.y() -// CHECK: tail call i32 @llvm.r600.read.tgid.z() +// CHECK: call i32 @llvm.r600.read.tgid.x() +// CHECK: call i32 @llvm.r600.read.tgid.y() +// CHECK: call i32 @llvm.r600.read.tgid.z() void test_get_group_id(int d, global int *out) { switch (d) { @@ -39,9 +39,9 @@ } // CHECK-LABEL: @test_get_local_id( -// CHECK: tail call i32 @llvm.r600.read.tidig.x(), !range [[WI_RANGE:![0-9]*]] -// CHECK: tail call i32 @llvm.r600.read.tidig.y(), !range [[WI_RANGE]] -// CHECK: tail call i32 @llvm.r600.read.tidig.z(), !range [[WI_RANGE]] +// CHECK: call i32 @llvm.r600.read.tidig.x(), !range [[WI_RANGE:![0-9]*]] +// CHECK: call i32 @llvm.r600.read.tidig.y(), !range [[WI_RANGE]] +// CHECK: call i32 @llvm.r600.read.tidig.z(), !range [[WI_RANGE]] void test_get_local_id(int d, global int *out) { switch (d) { diff --git a/clang/test/CodeGenOpenCL/convergent.cl b/clang/test/CodeGenOpenCL/convergent.cl --- a/clang/test/CodeGenOpenCL/convergent.cl +++ b/clang/test/CodeGenOpenCL/convergent.cl @@ -33,14 +33,14 @@ // CHECK: br i1 %[[tobool]], label %[[if_end3_critedge:.+]], label %[[if_then:.+]] // CHECK: [[if_then]]: -// CHECK: tail call spir_func void @f() -// CHECK: tail call spir_func void @non_convfun() +// CHECK: call spir_func void @f() +// CHECK: call spir_func void @non_convfun() // CHECK: tail call spir_func void @g() // CHECK: br label %[[if_end3:.+]] // CHECK: [[if_end3_critedge]]: -// CHECK: tail call spir_func void @non_convfun() +// CHECK: call spir_func void @non_convfun() // CHECK: br label %[[if_end3]] // CHECK: [[if_end3]]: @@ -65,13 +65,13 @@ // CHECK: %[[tobool:.+]] = icmp eq i32 %a, 0 // CHECK: br i1 %[[tobool]], label %[[if_end:.+]], label %[[if_then:.+]] // CHECK: [[if_then]]: -// CHECK: tail call spir_func void @f() +// CHECK: call spir_func void @f() // CHECK-NOT: call spir_func void @convfun() // CHECK-NOT: call spir_func void @g() // CHECK: br label %[[if_end]] // CHECK: [[if_end]]: // CHECK: %[[tobool_pr:.+]] = phi i1 [ true, %[[if_then]] ], [ false, %{{.+}} ] -// CHECK: tail call spir_func void @convfun() #[[attr4:.+]] +// CHECK: call spir_func void @convfun() #[[attr4:.+]] // CHECK: br i1 %[[tobool_pr]], label %[[if_then2:.+]], label %[[if_end3:.+]] // CHECK: [[if_then2]]: // CHECK: tail call spir_func void @g() @@ -93,16 +93,16 @@ // Test loop is unrolled for convergent function. 
// CHECK-LABEL: define spir_func void @test_unroll() local_unnamed_addr #1 -// CHECK: tail call spir_func void @convfun() #[[attr4:[0-9]+]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] -// CHECK: tail call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4:[0-9]+]] +// CHECK: call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4]] +// CHECK: call spir_func void @convfun() #[[attr4]] // CHECK-LABEL: ret void void test_unroll() { @@ -116,7 +116,7 @@ // CHECK: [[for_cond_cleanup:.+]]: // CHECK: ret void // CHECK: [[for_body]]: -// CHECK: tail call spir_func void @nodupfun() #[[attr5:[0-9]+]] +// CHECK: call spir_func void @nodupfun() #[[attr5:[0-9]+]] // CHECK-NOT: call spir_func void @nodupfun() // The new PM produces a slightly different IR for the loop from the legacy PM, diff --git a/clang/test/CodeGenOpenCL/spir-calling-conv.cl b/clang/test/CodeGenOpenCL/spir-calling-conv.cl --- a/clang/test/CodeGenOpenCL/spir-calling-conv.cl +++ b/clang/test/CodeGenOpenCL/spir-calling-conv.cl @@ -8,7 +8,7 @@ // CHECK: define spir_kernel void @foo(i32 addrspace(1)* %A) { int id = get_dummy_id(0); - // CHECK: %{{[a-z0-9_]+}} = tail call spir_func i32 @get_dummy_id(i32 0) + // CHECK: %{{[a-z0-9_]+}} = call spir_func i32 @get_dummy_id(i32 0) A[id] = id; bar(A); // CHECK: tail call spir_kernel void @bar(i32 addrspace(1)* %A) diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -89,16 +89,6 @@ STATISTIC(NumRetDuped, "Number of return duplicated"); STATISTIC(NumAccumAdded, "Number of accumulators introduced"); -/// Scan the specified function for alloca instructions. -/// If it contains any dynamic allocas, returns false. -static bool canTRE(Function &F) { - // Because of PR962, we don't TRE dynamic allocas. - return llvm::all_of(instructions(F), [](Instruction &I) { - auto *AI = dyn_cast<AllocaInst>(&I); - return !AI || AI->isStaticAlloca(); - }); -} - namespace { struct AllocaDerivedValueTracker { // Start at a root value and walk its use-def chain to mark calls that use the @@ -185,11 +175,144 @@ }; } -static bool markTails(Function &F, bool &AllCallsAreTailCalls, - OptimizationRemarkEmitter *ORE) { +// Debug info intrinsics, lifetime end or assume intrinsic +// should not stop tail call optimization.
+static bool canBeIgnoredForTailCall(Instruction *I) { + if (isa<DbgInfoIntrinsic>(I)) + return true; + + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + if (II->getIntrinsicID() == Intrinsic::lifetime_end || + II->getIntrinsicID() == Intrinsic::assume) + return true; + + return false; +} + +/// Returns true if it is safe to move the specified +/// instruction from after the call to before the call, assuming that all +/// instructions between the call and this instruction are movable. +static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { + if (canBeIgnoredForTailCall(I)) + return true; + + // FIXME: We can move load/store/call/free instructions above the call if the + // call does not mod/ref the memory location being processed. + if (I->mayHaveSideEffects()) // This also handles volatile loads. + return false; + + if (LoadInst *L = dyn_cast<LoadInst>(I)) { + // Loads may always be moved above calls without side effects. + if (CI->mayHaveSideEffects()) { + // Non-volatile loads may be moved above a call with side effects if it + // does not write to memory and the load provably won't trap. + // Writes to memory only matter if they may alias the pointer + // being loaded from. + const DataLayout &DL = L->getModule()->getDataLayout(); + if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || + !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(), + L->getAlign(), DL, L)) + return false; + } + } + + // Otherwise, if this is a side-effect free instruction, check to make sure + // that it does not use the return value of the call. If it doesn't use the + // return value of the call, it must only use things that are defined before + // the call, or movable instructions between the call and the instruction + // itself. + return !is_contained(I->operands(), CI); +} + +/// Returns true if it is a noop bitcast +/// (i.e. it casts a pointer to a pointer). +static bool isNoopBitcast(Instruction *I) { + if (BitCastInst *BitCast = dyn_cast<BitCastInst>(I)) { + Type *T1 = BitCast->getType(); + Type *T2 = BitCast->getOperand(0)->getType(); + + return T1 == T2 || (T1->isPointerTy() && T2->isPointerTy()); + } + + return false; +} + +/// Returns true if this is a block with a "return", an "unreachable", +/// or an "unconditional branch" to another block with a "return" +/// instruction. RetValue keeps the return value if applicable. +static bool isTailBlock(BasicBlock *BB, Value*& RetValue) { + Instruction *LastBlockInstr = BB->getTerminator(); + RetValue = nullptr; + + // Check that the last instruction is a "return", an "unreachable", + // or an unconditional branch to another block containing a "return". + if (ReturnInst* Ret = dyn_cast<ReturnInst>(LastBlockInstr)) { + RetValue = Ret->getReturnValue(); + return true; + } + + if (isa<UnreachableInst>(LastBlockInstr)) + return true; + + if (BranchInst *Branch = dyn_cast<BranchInst>(LastBlockInstr)) { + if (Branch->isUnconditional()) + if(ReturnInst* Ret = dyn_cast<ReturnInst>(Branch->getSuccessor(0)->getFirstNonPHIOrDbg())) { + RetValue = Ret->getReturnValue(); + return true; + } + } + + return false; +} + +/// Checks that the specified call instruction is in a chain of recursive +/// calls before the return. +static bool areAllLastFuncCallRecursive(CallInst *Inst, Function &F) { + BasicBlock::iterator BBI(Inst->getParent()->getTerminator()); + for (--BBI; &*BBI != Inst; --BBI) { + if (CallInst *CI = dyn_cast<CallInst>(&*BBI)) + if (!canBeIgnoredForTailCall(CI) && CI->getCalledFunction() != &F) + return false; + } + + return true; +} + +/// Checks that the call position is suitable for tailcall optimization.
+static bool isInTailCallPosition(CallInst *TailCallCandidate, + AliasAnalysis *AA) { + Value* RetValue = nullptr; + if (!isTailBlock(TailCallCandidate->getParent(), RetValue)) + return false; + + // Check that the instruction chain between the call and the return + // contains only noops or instructions that can be moved. + BasicBlock::iterator BBI(TailCallCandidate->getParent()->getTerminator()); + for (--BBI; &*BBI != TailCallCandidate; --BBI) { + if (canMoveAboveCall(&*BBI, TailCallCandidate, AA)) + continue; + else if (isNoopBitcast(&*BBI)) + continue; + else + return false; + } + + // Check that the return value matches TailCallCandidate or is void. + if (RetValue != nullptr && RetValue != TailCallCandidate) { + if (BitCastInst *BitCast = dyn_cast<BitCastInst>(RetValue)) + return (isNoopBitcast(BitCast) && BitCast->getOperand(0) == TailCallCandidate); + + // Return operand does not match the tailcall candidate. + return false; + } + + return true; +} + +static bool markTails(Function &F, OptimizationRemarkEmitter *ORE, + AliasAnalysis *AA) { if (F.callsFunctionThatReturnsTwice()) return false; - AllCallsAreTailCalls = true; // The local stack holds all alloca instructions and all byval arguments. AllocaDerivedValueTracker Tracker; @@ -237,7 +360,8 @@ Escaped = ESCAPED; CallInst *CI = dyn_cast<CallInst>(&I); - if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I)) + + if (!CI || CI->isTailCall() || !isInTailCallPosition(CI, AA)) continue; bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles(); @@ -272,11 +396,8 @@ } } - if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) DeferredTails.push_back(CI); - } else { - AllCallsAreTailCalls = false; - } } for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) { @@ -313,47 +434,12 @@ LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n"); CI->setTailCall(); Modified = true; - } else { - AllCallsAreTailCalls = false; } } return Modified; } -/// Return true if it is safe to move the specified -/// instruction from after the call to before the call, assuming that all -/// instructions between the call and this instruction are movable. -/// -static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { - // FIXME: We can move load/store/call/free instructions above the call if the - // call does not mod/ref the memory location being processed. - if (I->mayHaveSideEffects()) // This also handles volatile loads. - return false; - - if (LoadInst *L = dyn_cast<LoadInst>(I)) { - // Loads may always be moved above calls without side effects. - if (CI->mayHaveSideEffects()) { - // Non-volatile loads may be moved above a call with side effects if it - // does not write to memory and the load provably won't trap. - // Writes to memory only matter if they may alias the pointer - // being loaded from. - const DataLayout &DL = L->getModule()->getDataLayout(); - if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || - !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(), - L->getAlign(), DL, L)) - return false; - } - } - - // Otherwise, if this is a side-effect free instruction, check to make sure - // that it does not use the return value of the call. If it doesn't use the - // return value of the call, it must only use things that are defined before - // the call, or movable instructions between the call and the instruction - // itself.
- return !is_contained(I->operands(), CI); -} - static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) { if (!I->isAssociative() || !I->isCommutative()) return false; @@ -392,7 +478,6 @@ // createTailRecurseLoopHeader the first time we find a call we can eliminate. BasicBlock *HeaderBB = nullptr; SmallVector<PHINode *, 8> ArgumentPHIs; - bool RemovableCallsMustBeMarkedTail = false; // PHI node to store our return value. PHINode *RetPN = nullptr; @@ -419,8 +504,7 @@ DomTreeUpdater &DTU) : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {} - CallInst *findTRECandidate(Instruction *TI, - bool CannotTailCallElimCallsMarkedTail); + CallInst *findTRECandidate(Instruction *TI); void createTailRecurseLoopHeader(CallInst *CI); @@ -428,14 +512,14 @@ bool eliminateCall(CallInst *CI); - bool foldReturnAndProcessPred(ReturnInst *Ret, - bool CannotTailCallElimCallsMarkedTail); + bool foldReturnAndProcessPred(ReturnInst *Ret); - bool processReturningBlock(ReturnInst *Ret, - bool CannotTailCallElimCallsMarkedTail); + bool processReturningBlock(ReturnInst *Ret); void cleanupAndFinalize(); + bool canTRE(Function &F); + public: static bool eliminate(Function &F, const TargetTransformInfo *TTI, AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, @@ -443,8 +527,7 @@ }; } // namespace -CallInst *TailRecursionEliminator::findTRECandidate( - Instruction *TI, bool CannotTailCallElimCallsMarkedTail) { +CallInst *TailRecursionEliminator::findTRECandidate(Instruction *TI) { BasicBlock *BB = TI->getParent(); if (&BB->front() == TI) // Make sure there is something before the terminator. @@ -464,11 +547,6 @@ --BBI; } - // If this call is marked as a tail call, and if there are dynamic allocas in - // the function, we cannot perform this optimization. - if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) - return nullptr; - // As a special case, detect code like this: // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call // and disable this xform in this case, because the code generator will @@ -498,26 +576,15 @@ BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry); BI->setDebugLoc(CI->getDebugLoc()); - // If this function has self recursive calls in the tail position where some - // are marked tail and some are not, only transform one flavor or another. - // We have to choose whether we move allocas in the entry block to the new - // entry block or not, so we can't make a good choice for both. We make this - // decision here based on whether the first call we found to remove is - // marked tail. - // NOTE: We could do slightly better here in the case that the function has - // no entry block allocas. - RemovableCallsMustBeMarkedTail = CI->isTailCall(); - - // If this tail call is marked 'tail' and if there are any allocas in the + // If there are any allocas in the // entry block, move them up to the new entry block. - if (RemovableCallsMustBeMarkedTail) - // Move all fixed sized allocas from HeaderBB to NewEntry. - for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(), - NEBI = NewEntry->begin(); - OEBI != E;) - if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) - if (isa<ConstantInt>(AI->getArraySize())) - AI->moveBefore(&*NEBI); + // Move all fixed sized allocas from HeaderBB to NewEntry.
+ for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(), + NEBI = NewEntry->begin(); + OEBI != E;) + if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) + if (isa<ConstantInt>(AI->getArraySize())) + AI->moveBefore(&*NEBI); // Now that we have created a new block, which jumps to the entry // block, insert a PHI node for each argument of the function. @@ -620,9 +687,6 @@ if (!HeaderBB) createTailRecurseLoopHeader(CI); - if (RemovableCallsMustBeMarkedTail && !CI->isTailCall()) - return false; - // Ok, now that we know we have a pseudo-entry block WITH all of the // required PHI nodes, add entries into the PHI node for the actual // parameters passed into the tail-recursive call. @@ -672,8 +736,7 @@ return true; } -bool TailRecursionEliminator::foldReturnAndProcessPred( - ReturnInst *Ret, bool CannotTailCallElimCallsMarkedTail) { +bool TailRecursionEliminator::foldReturnAndProcessPred(ReturnInst *Ret) { BasicBlock *BB = Ret->getParent(); bool Change = false; @@ -698,8 +761,7 @@ while (!UncondBranchPreds.empty()) { BranchInst *BI = UncondBranchPreds.pop_back_val(); BasicBlock *Pred = BI->getParent(); - if (CallInst *CI = - findTRECandidate(BI, CannotTailCallElimCallsMarkedTail)) { + if (CallInst *CI = findTRECandidate(BI)) { LLVM_DEBUG(dbgs() << "FOLDING: " << *BB << "INTO UNCOND BRANCH PRED: " << *Pred); FoldReturnIntoUncondBranch(Ret, BB, Pred, &DTU); @@ -720,9 +782,8 @@ return Change; } -bool TailRecursionEliminator::processReturningBlock( - ReturnInst *Ret, bool CannotTailCallElimCallsMarkedTail) { - CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); +bool TailRecursionEliminator::processReturningBlock(ReturnInst *Ret) { + CallInst *CI = findTRECandidate(Ret); if (!CI) return false; @@ -810,23 +871,18 @@ return false; bool MadeChange = false; - bool AllCallsAreTailCalls = false; - MadeChange |= markTails(F, AllCallsAreTailCalls, ORE); - if (!AllCallsAreTailCalls) - return MadeChange; + MadeChange |= markTails(F, ORE, AA); // If this function is a varargs function, we won't be able to PHI the args // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return MadeChange; - // If false, we cannot perform TRE on tail calls marked with the 'tail' - // attribute, because doing so would cause the stack size to increase (real - // TRE would deallocate variable sized allocas, TRE doesn't). - bool CanTRETailMarkedCall = canTRE(F); - TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU); + if (!TRE.canTRE(F)) + return MadeChange; + // Change any tail recursive calls to loops. // // FIXME: The code generator produces really bad code when an 'escaping @@ -836,9 +892,9 @@ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB. if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { - bool Change = TRE.processReturningBlock(Ret, !CanTRETailMarkedCall); + bool Change = TRE.processReturningBlock(Ret); if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = TRE.foldReturnAndProcessPred(Ret, !CanTRETailMarkedCall); + Change = TRE.foldReturnAndProcessPred(Ret); MadeChange |= Change; } } @@ -848,6 +904,51 @@ return MadeChange; } +bool TailRecursionEliminator::canTRE(Function &F) { + // The local stack holds all alloca instructions and all byval arguments.
+ AllocaDerivedValueTracker Tracker; + for (Argument &Arg : F.args()) { + if (Arg.hasByValAttr()) + Tracker.walk(&Arg); + } + for (auto &BB : F) { + for (auto &I : BB) + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) + Tracker.walk(AI); + } + + // Do not do TRE if any pointer to the local stack has escaped. + if (!Tracker.EscapePoints.empty()) + return false; + + return !llvm::any_of(instructions(F), [&](Instruction &I) { + // Because of PR962, we don't TRE dynamic allocas. + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { + if (AI && !AI->isStaticAlloca()) + return true; + } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { + if (CI->getCalledFunction() == &F) { + // Do not do TRE if explicitly marked as NoTailcall or has Operand + // Bundles. + if (CI->isNoTailCall() || CI->hasOperandBundles()) + return true; + + // Do not do TRE if there are recursive calls that are not last calls. + Value* RetValue = nullptr; + if (!isTailBlock(CI->getParent(), RetValue) || + !areAllLastFuncCallRecursive(CI, F)) + return true; + + // Do not do TRE if the recursive call receives a pointer to the local stack. + if (Tracker.AllocaUsers.count(CI) > 0) + return true; + } + } + + return false; + }); +} + namespace { struct TailCallElim : public FunctionPass { static char ID; // Pass identification, replacement for typeid diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll @@ -55,22 +55,22 @@ } ; GCN: define amdgpu_kernel void @test_inliner( -; GCN-INL1: %c1 = tail call coldcc float @foo( +; GCN-INL1: %c1 = call coldcc float @foo( ; GCN-INLDEF: %cmp.i = fcmp ogt float %tmp2, 0.000000e+00 ; GCN: %div.i{{[0-9]*}} = fdiv float 1.000000e+00, %c ; GCN: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i ; GCN: call void @foo_noinline( -; GCN: tail call float @_Z3sinf( +; GCN: call float @_Z3sinf( define amdgpu_kernel void @test_inliner(float addrspace(1)* nocapture %a, i32 %n) { entry: %pvt_arr = alloca [64 x float], align 4, addrspace(5) - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid %tmp2 = load float, float addrspace(1)* %arrayidx, align 4 %add = add i32 %tid, 1 %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %a, i32 %add %tmp5 = load float, float addrspace(1)* %arrayidx2, align 4 - %c1 = tail call coldcc float @foo(float %tmp2, float %tmp5) + %c1 = call coldcc float @foo(float %tmp2, float %tmp5) %or = or i32 %tid, %n %arrayidx5 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %or store float %c1, float addrspace(5)* %arrayidx5, align 4 @@ -98,7 +98,7 @@ entry: %pvt_arr1 = alloca [32 x float], align 4, addrspace(5) %pvt_arr2 = alloca [32 x float], align 4, addrspace(5) - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid %or = or i32 %tid, %n %arrayidx4 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %or @@ -125,7 +125,7 @@ entry: %pvt_arr1 = alloca [32 x float], align 4, addrspace(5) %pvt_arr2 = alloca [33 x float], align 4, addrspace(5) - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid %or = or i32 %tid, %n %arrayidx4 = getelementptr inbounds [32 x float],
[32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %or diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-transforms.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-transforms.ll --- a/llvm/test/CodeGen/BPF/CORE/intrinsic-transforms.ll +++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-transforms.ll @@ -55,8 +55,8 @@ %call3 = call i32 @get_value(i32* %9), !dbg !26 br label %cond.end5, !dbg !24 -; CHECK: tail call i32* @llvm.preserve.array.access.index.p0i32.p0i32(i32* %{{[0-9a-z]+}}, i32 0, i32 4), !dbg !{{[0-9]+}}, !llvm.preserve.access.index !{{[0-9]+}} -; CHECK-NOT: tail call i32* @llvm.preserve.array.access.index +; CHECK: call i32* @llvm.preserve.array.access.index.p0i32.p0i32(i32* %{{[0-9a-z]+}}, i32 0, i32 4), !dbg !{{[0-9]+}}, !llvm.preserve.access.index !{{[0-9]+}} +; CHECK-NOT: call i32* @llvm.preserve.array.access.index cond.false4: ; preds = %cond.end br label %cond.end5, !dbg !24 diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll b/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll --- a/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll @@ -31,7 +31,7 @@ ; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i8, i8* %buffer, i64 8 ; CHECK-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to i32* ; CHECK-NEXT: store i32 %n, i32* [[T1]], align 4 -; CHECK-NEXT: [[ALLOC:%.*]] = tail call i8* @allocate(i32 %n) +; CHECK-NEXT: [[ALLOC:%.*]] = call i8* @allocate(i32 %n) ; CHECK-NEXT: [[T1:%.*]] = bitcast i8* %buffer to i8** ; CHECK-NEXT: store i8* [[ALLOC]], i8** [[T1]], align 8 ; CHECK-NEXT: [[T0:%.*]] = insertvalue { i8*, i8*, i32 } { i8* bitcast ({ i8*, i8*, i32 } (i8*, i1)* @f.resume.0 to i8*), i8* undef, i32 undef }, i8* [[ALLOC]], 1 @@ -43,7 +43,7 @@ ; CHECK-NEXT: : ; CHECK-NEXT: [[T1:%.*]] = bitcast i8* %0 to i8** ; CHECK-NEXT: [[ALLOC:%.*]] = load i8*, i8** [[T1]], align 8 -; CHECK-NEXT: tail call void @deallocate(i8* [[ALLOC]]) +; CHECK-NEXT: call void @deallocate(i8* [[ALLOC]]) ; CHECK-NEXT: br i1 %1, declare {i8*, i32} @prototype_g(i8*, i1) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll @@ -331,16 +331,16 @@ ; CHECK-LABEL: @fmin_v4i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, !tbaa !7 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP0]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.minnum.f32(float [[TMP0]], float 0x47EFFFFFE0000000) ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, !tbaa !7 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP2]], float [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.minnum.f32(float [[TMP2]], float [[TMP1]]) ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2 ; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, !tbaa !7 -; CHECK-NEXT: [[TMP5:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP4]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.minnum.f32(float [[TMP4]], float [[TMP3]]) ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = load 
float, float* [[ARRAYIDX_3]], align 4, !tbaa !7 -; CHECK-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP6]], float [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.minnum.f32(float [[TMP6]], float [[TMP5]]) ; CHECK-NEXT: ret float [[TMP7]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll b/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll --- a/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll +++ b/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll @@ -44,7 +44,7 @@ ; ASSUMPTIONS-ON-NEXT: [[PTRINT:%.*]] = ptrtoint i64* [[PTR:%.*]] to i64 ; ASSUMPTIONS-ON-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 7 ; ASSUMPTIONS-ON-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; ASSUMPTIONS-ON-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; ASSUMPTIONS-ON-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) ; ASSUMPTIONS-ON-NEXT: store volatile i64 0, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 diff --git a/llvm/test/Transforms/TailCallElim/accum_recursion.ll b/llvm/test/Transforms/TailCallElim/accum_recursion.ll --- a/llvm/test/Transforms/TailCallElim/accum_recursion.ll +++ b/llvm/test/Transforms/TailCallElim/accum_recursion.ll @@ -59,9 +59,9 @@ bb1: %0 = add i64 %n, -1 - %recurse1 = tail call i64 @test3_fib(i64 %0) nounwind + %recurse1 = call i64 @test3_fib(i64 %0) nounwind %1 = add i64 %n, -2 - %recurse2 = tail call i64 @test3_fib(i64 %1) nounwind + %recurse2 = call i64 @test3_fib(i64 %1) nounwind %accumulate = add nsw i64 %recurse2, %recurse1 ret i64 %accumulate diff --git a/llvm/test/Transforms/TailCallElim/basic.ll b/llvm/test/Transforms/TailCallElim/basic.ll --- a/llvm/test/Transforms/TailCallElim/basic.ll +++ b/llvm/test/Transforms/TailCallElim/basic.ll @@ -211,13 +211,17 @@ ; If an alloca is passed byval it is not a use of the alloca or an escape ; point, and both calls below can be marked tail. -define void @test13() { +define void @test13(i1 %cond) { ; CHECK-LABEL: @test13 ; CHECK: tail call void @bar(%struct.foo* byval %f) ; CHECK: tail call void @bar(%struct.foo* null) entry: %f = alloca %struct.foo + br i1 %cond, label %cond_true, label %cond_false +cond_true: call void @bar(%struct.foo* byval %f) + ret void +cond_false: call void @bar(%struct.foo* null) ret void } @@ -245,5 +249,19 @@ ret void } +; If one call followed by another call then the first one +; should not be marked as a tailcall. +define void @test16(i32 %X) { +; CHECK-NOT: tail call void @noarg() +; CHECK: add i32 %X +; CHECK-NEXT: tail call void @noarg() +; CHECK-NEXT: ret + call void @noarg() + %DUMMY = add i32 %X, 1 + call void @noarg() + ret void +} + + declare void @bar(%struct.foo* byval) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
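As an illustration of the markTails/isInTailCallPosition behaviour introduced above, here is a small IR sketch written for this note; it is not part of the patch or its tests, and the names @count and @use are made up. The recursive call is followed only by the return of its value, so it is accepted as a tail-call position and remains a TRE candidate, while the call to @use is followed by an add that consumes its result, so canMoveAboveCall rejects it and it is left unmarked.

; Illustrative sketch only (assumed functions @count and @use).
define i32 @count(i32 %n) {
entry:
  %iszero = icmp eq i32 %n, 0
  br i1 %iszero, label %base, label %recurse

recurse:
  ; Tail position: nothing but the return of %r follows the recursive call,
  ; so markTails may add the 'tail' marker and TRE can turn it into a loop.
  %n1 = sub i32 %n, 1
  %r = call i32 @count(i32 %n1)
  ret i32 %r

base:
  ; Not a tail position: the add below uses the call's result before the
  ; return, so the call stays a plain 'call'.
  %v = call i32 @use(i32 %n)
  %v1 = add i32 %v, 1
  ret i32 %v1
}

declare i32 @use(i32)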