Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8934,6 +8934,37 @@
   }
 }
 
+bool VPRecipeBuilder::isUniformIntrinsicCall(CallInst *CI,
+                                             bool IsScalable) const {
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+  switch (ID) {
+  case Intrinsic::sideeffect:
+  case Intrinsic::experimental_noalias_scope_decl:
+    return true;
+  case Intrinsic::assume:
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end:
+    // For scalable vectors, if one of the operands is loop-variant we still
+    // want to mark the call as uniform, which will generate one instruction
+    // for just the first lane of the vector. We can't scalarize the call in
+    // the same way as for fixed-width vectors because we don't know how many
+    // lanes there are.
+    //
+    // The reasons for doing it this way for scalable vectors are:
+    //   1. For the assume intrinsic, generating the instruction for the
+    //      first lane may still be better than not generating one at all.
+    //      For example, the input may be a splat across all lanes.
+    //   2. For the lifetime start/end intrinsics, the pointer operand only
+    //      does anything useful when it comes from an alloca, which suggests
+    //      it should always be uniform. When it does not come from an
+    //      alloca, the intrinsic has no effect anyway.
+    return IsScalable || OrigLoop->hasLoopInvariantOperands(CI);
+  default:
+    break;
+  }
+  return false;
+}
+
 VPBasicBlock *VPRecipeBuilder::handleReplication(
     Instruction *I, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan) {
@@ -8944,6 +8975,10 @@
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
 
+  if (!IsUniform && isa<CallInst>(I))
+    IsUniform =
+        isUniformIntrinsicCall(cast<CallInst>(I), Range.Start.isScalable());
+
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
   setRecipe(I, Recipe);
Index: llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -171,6 +171,10 @@
       Instruction *I, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan);
 
+  /// Returns true if the intrinsic call \p CI should be treated as a uniform
+  /// instruction; \p IsScalable indicates whether a scalable VF is in use.
+  bool isUniformIntrinsicCall(CallInst *CI, bool IsScalable) const;
+
   /// Add the incoming values from the backedge to reduction & first-order
   /// recurrence cross-iteration phis.
   void fixHeaderPhis();
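The comment in isUniformIntrinsicCall above describes emitting the intrinsic once, for the first lane only, when the VF is scalable. As a rough, hand-written illustration of that shape (this is not compiler output; the function and value names are made up for this sketch, and the authoritative patterns are the scalable-assume.ll checks added below): the condition is still computed across the whole scalable vector, but only lane 0 feeds the assume, because the lane count is unknown at compile time and the call cannot be scalarized per lane.

; Illustrative sketch only (assumed names, not generated by this patch):
define void @first_lane_assume_sketch(<vscale x 2 x float> %x, <vscale x 2 x float> %y) {
entry:
  ; The compare is widened across all lanes of the scalable vector ...
  %cmp = fcmp ogt <vscale x 2 x float> %x, %y
  ; ... but the assume is emitted once, on lane 0 of the mask, rather than
  ; once per lane, since the number of lanes is not known at compile time.
  %lane0 = extractelement <vscale x 2 x i1> %cmp, i32 0
  tail call void @llvm.assume(i1 %lane0)
  ret void
}

declare void @llvm.assume(i1)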
Index: llvm/test/Transforms/LoopVectorize/assume.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/assume.ll
+++ llvm/test/Transforms/LoopVectorize/assume.ll
@@ -49,12 +49,8 @@
 ; CHECK: vector.body:
 ; CHECK: tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK: tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK: for.body:
 entry:
   %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
Index: llvm/test/Transforms/LoopVectorize/scalable-assume.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -0,0 +1,125 @@
+; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S | FileCheck %s
+
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
+; CHECK-LABEL: @test1(
+; CHECK: vector.body:
+; CHECK: [[FCMP1:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT: [[FCMP1L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP1]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1L0]])
+; CHECK-NEXT: [[FCMP2L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP2]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2L0]])
+entry:
+  br label %for.body
+
+for.body:                                  ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.assume(i1 %cmp1)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                   ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind willreturn }
+
+%struct.data = type { float*, float* }
+
+define void @test2(%struct.data* nocapture readonly %d) {
+; CHECK-LABEL: @test2(
+; CHECK: entry:
+; CHECK: [[MASKCOND:%.*]] = icmp eq i64 %maskedptr, 0
+; CHECK: [[MASKCOND4:%.*]] = icmp eq i64 %maskedptr3, 0
+; CHECK: vector.body:
+; CHECK: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK: tail call void @llvm.assume(i1 [[MASKCOND4]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
+entry:
+  %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
+  %0 = load float*, float** %b, align 8
+  %ptrint = ptrtoint float* %0 to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
+  %1 = load float*, float** %a, align 8
+  %ptrint2 = ptrtoint float* %1 to i64
+  %maskedptr3 = and i64 %ptrint2, 31
+  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  br label %for.body
+
+for.body:                                  ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx, align 4
+  %add = fadd float %2, 1.000000e+00
+  tail call void @llvm.assume(i1 %maskcond4)
+  %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                   ; preds = %for.body
+  ret void
+}
+
+; Test case for PR43620. Make sure we can vectorize with predication in the
+; presence of assume calls. For now, check that we drop all assumes in
+; predicated blocks in the vector body.
+define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
+; Check that the vector.body does not contain any assumes.
+; CHECK-LABEL: @predicated_assume(
+; CHECK: vector.body:
+; CHECK-NOT: llvm.assume
+; CHECK: for.body:
+entry:
+  %cmp15 = icmp eq i32 %n, 0
+  br i1 %cmp15, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                        ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                 ; preds = %if.end5
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                          ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                  ; preds = %for.body.preheader, %if.end5
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end5 ]
+  %cmp1 = icmp ult i64 %indvars.iv, 495616
+  br i1 %cmp1, label %if.end5, label %if.else
+
+if.else:                                   ; preds = %for.body
+  %cmp2 = icmp ult i64 %indvars.iv, 991232
+  tail call void @llvm.assume(i1 %cmp2)
+  br label %if.end5
+
+if.end5:                                   ; preds = %for.body, %if.else
+  %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %1
+  %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %mul, float* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
@@ -0,0 +1,81 @@
+; RUN: opt -S -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can vectorize loops which contain lifetime markers.
+
+define void @test(i32 *%d) {
+; CHECK-LABEL: @test(
+; CHECK: entry:
+; CHECK: [[ALLOCA:%.*]] = alloca [1024 x i32], align 16
+; CHECK-NEXT: [[BC:%.*]] = bitcast [1024 x i32]* [[ALLOCA]] to i8*
+; CHECK: vector.body:
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 4096, i8* [[BC]])
+; CHECK: store
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 4096, i8* [[BC]])
+
+entry:
+  %arr = alloca [1024 x i32], align 16
+  %0 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
+
+for.end:
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  ret void
+}
+
+; CHECK-LABEL: @testloopvariant(
+; CHECK: entry:
+; CHECK: [[ALLOCA:%.*]] = alloca [1024 x i32], align 16
+; CHECK: vector.ph:
+; CHECK: [[TMP1:%.*]] = insertelement <vscale x 2 x [1024 x i32]*> poison, [1024 x i32]* %arr, i32 0
+; CHECK-NEXT: [[SPLAT_ALLOCA:%.*]] = shufflevector <vscale x 2 x [1024 x i32]*> [[TMP1]], <vscale x 2 x [1024 x i32]*> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: [[BC_ALLOCA:%.*]] = bitcast <vscale x 2 x [1024 x i32]*> [[SPLAT_ALLOCA]] to <vscale x 2 x i8*>
+; CHECK-NEXT: [[ONE_LIFETIME:%.*]] = extractelement <vscale x 2 x i8*> [[BC_ALLOCA]], i32 0
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4096, i8* [[ONE_LIFETIME]])
+; CHECK: store
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4096, i8* [[ONE_LIFETIME]])
+
+define void @testloopvariant(i32 *%d) {
+entry:
+  %arr = alloca [1024 x i32], align 16
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv
+  %1 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
@@ -0,0 +1,142 @@
+; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s
+
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
+
+for.body:                                  ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                   ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+
+%struct.data = type { float*, float* }
+
+define void @test2(%struct.data* nocapture readonly %d) {
+entry:
+  %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
+  %0 = load float*, float** %b, align 8
+  %ptrint = ptrtoint float* %0 to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
+  %1 = load float*, float** %a, align 8
+  %ptrint2 = ptrtoint float* %1 to i64
+  %maskedptr3 = and i64 %ptrint2, 31
+  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  br label %for.body
+
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST:!.*]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST:!.*]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
+
+for.body:                                  ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx, align 4
+  %add = fadd float %2, 1.000000e+00
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !4)
+  %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                   ; preds = %for.body
+  ret void
+}
+
+define void @predicated_noalias_scope_decl(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
+
+; Check that the vector.body still contains a llvm.experimental.noalias.scope.decl
+
+; CHECK-LABEL: @predicated_noalias_scope_decl(
+; CHECK: vector.body:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: scalar.ph:
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: if.else:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: }
+
+entry:
+  %cmp15 = icmp eq i32 %n, 0
+  br i1 %cmp15, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                        ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                 ; preds = %if.end5
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                          ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                  ; preds = %for.body.preheader, %if.end5
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end5 ]
+  %cmp1 = icmp ult i64 %indvars.iv, 495616
+  br i1 %cmp1, label %if.end5, label %if.else
+
+if.else:                                   ; preds = %for.body
+  %cmp2 = icmp ult i64 %indvars.iv, 991232
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  br label %if.end5
+
+if.end5:                                   ; preds = %for.body, %if.else
+  %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %1
+  %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %mul, float* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !5
+}
+
+!0 = !{ !1 }
+!1 = distinct !{ !1, !2 }
+!2 = distinct !{ !2 }
+!3 = distinct !{ !3, !2 }
+!4 = !{ !3 }
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; CHECK: [[SCOPE0_LIST]] = !{[[SCOPE0:!.*]]}
+; CHECK: [[SCOPE0]] = distinct !{[[SCOPE0]], [[SCOPE0_DOM:!.*]]}
+; CHECK: [[SCOPE0_DOM]] = distinct !{[[SCOPE0_DOM]]}
+; CHECK: [[SCOPE4_LIST]] = !{[[SCOPE4:!.*]]}
+; CHECK: [[SCOPE4]] = distinct !{[[SCOPE4]], [[SCOPE0_DOM]]}