Index: lib/Transforms/Scalar/LICM.cpp
===================================================================
--- lib/Transforms/Scalar/LICM.cpp
+++ lib/Transforms/Scalar/LICM.cpp
@@ -480,6 +480,60 @@
         SafetyInfo->BlockColors = colorEHFunclets(*Fn);
 }
 
+// Returns true if the address of the load LI is an i8* value (possibly reached
+// by looking through bitcasts) that has a use-free llvm.invariant.start call
+// dominating LI. If the invariant.start has any use (an invariant.end), we do
+// not check whether the load is within the scope of <invariant.start..
+// invariant.end>; we conservatively return false.
+// NOTE(review): neither the size operand of invariant.start nor its position
+// relative to the loop being transformed is examined here -- confirm that
+// dominating LI alone is sufficient for the callers.
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT) {
+  Value *LoadOp = LI->getOperand(0);
+  const uint32_t MaxNumUses = 8;
+
+  // llvm.invariant.start takes an i8* operand, so only a value of that type
+  // can be used directly by the intrinsic.
+  auto isInvariantStartOperandType = [&]() {
+    return cast<PointerType>(LoadOp->getType())->getElementType() ==
+           Type::getInt8Ty(LI->getContext());
+  };
+
+  // Look through bitcasts until we reach the i8* type (this is invariant.start
+  // operand type).
+  while (auto *BC = dyn_cast<BitCastInst>(LoadOp)) {
+    if (isInvariantStartOperandType())
+      break;
+    LoadOp = BC->getOperand(0);
+  }
+
+  // Avoid traversing for LoadOperand with high number of users. Unlike
+  // getNumUses(), hasNUsesOrMore() stops walking the use list at the bound.
+  if (LoadOp->hasNUsesOrMore(MaxNumUses + 1))
+    return false;
+
+  // We know that the LoadOp type should be i8*, to be used in invariant.start
+  // intrinsic. This is an early check.
+  // TODO: Consider cases where the LoadOp is bitcasted to i8*
+  if (!isInvariantStartOperandType())
+    return false;
+
+  // Traverse all uses of the load operand value, to see if invariant.start is
+  // one of the uses, and whether it dominates the load instruction.
+  for (auto *U : LoadOp->users()) {
+    auto *II = dyn_cast<IntrinsicInst>(U);
+    if (!II || II->getIntrinsicID() != Intrinsic::invariant_start)
+      continue;
+    // If there are invariant.end instructions, the load maybe non-invariant.
+    // The only use of an invariant.start instruction is within its
+    // corresponding invariant.end instruction.
+    if (II->use_empty() && DT->dominates(II, LI))
+      return true;
+  }
+
+  return false;
+}
+
 bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                               Loop *CurLoop, AliasSetTracker *CurAST,
                               LoopSafetyInfo *SafetyInfo,
@@ -496,6 +550,10 @@
     if (LI->getMetadata(LLVMContext::MD_invariant_load))
       return true;
 
+    // This checks for an invariant.start dominating the load.
+    if (isLoadInvariantInLoop(LI, DT))
+      return true;
+
     // Don't hoist loads which have may-aliased stores in loop.
     uint64_t Size = 0;
     if (LI->getType()->isSized())
Index: test/Transforms/LICM/hoisting.ll
===================================================================
--- test/Transforms/LICM/hoisting.ll
+++ test/Transforms/LICM/hoisting.ll
@@ -149,3 +149,111 @@
 return:
   ret i32 %sum
 }
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) nounwind
+; invariant.start on %gep dominates the loop and has no invariant.end, so the
+; `addrld` load (address is a bitcast of %gep) can be hoisted out of the loop.
+define i32 @test_fence(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  ; %invst is left unused: there is no matching invariant.end in this test.
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8 ; expected in entry after LICM
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+
+
+; Same as the test above, but %invst is consumed by an invariant.end, so the
+; load is no longer treated as invariant. `addrld` must stay inside the loop.
+define i32 @test_fence1(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence1
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NEXT: invariant.end
+; CHECK-NEXT: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @llvm.invariant.end.p0i8({}* %invst, i64 4, i8* %gep) ; gives %invst a use
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8 ; must not move into entry
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; FIXME: invariant.start dominates the load, and in this scope, the
+; load is invariant. So, we could hoist the `addrld` load out of the loop.
+; It is not hoisted today because the load address %addr.i is itself bitcast
+; to i8* (%gep) before being passed to invariant.start.
+define i32 @test_fence3(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence3
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  ; invariant.start is applied to the i8* bitcast %gep, not to %addr.i itself.
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8 ; currently stays in the loop
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}