diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1041,6 +1041,7 @@
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
 
+  addPass(createSinkingPass());
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
   addPass(&AMDGPUUnifyDivergentExitNodesID);
@@ -1051,7 +1052,6 @@
     }
     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
   }
-  addPass(createSinkingPass());
   addPass(createAMDGPUAnnotateUniformValues());
   if (!LateCFGStructurize) {
     addPass(createSIAnnotateControlFlowPass());
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -77,6 +77,10 @@
 ; GCN-O0-NEXT:       Natural Loop Information
 ; GCN-O0-NEXT:       Legacy Divergence Analysis
 ; GCN-O0-NEXT:       AMDGPU IR late optimizations
+; GCN-O0-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O0-NEXT:       Function Alias Analysis Results
+; GCN-O0-NEXT:       Code sinking
+; GCN-O0-NEXT:       Legacy Divergence Analysis
 ; GCN-O0-NEXT:       Unify divergent function exit nodes
 ; GCN-O0-NEXT:       Lazy Value Information Analysis
 ; GCN-O0-NEXT:       Lower SwitchInst's to branches
@@ -89,12 +93,10 @@
 ; GCN-O0-NEXT:       Detect single entry single exit regions
 ; GCN-O0-NEXT:       Region Pass Manager
 ; GCN-O0-NEXT:         Structurize control flow
-; GCN-O0-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O0-NEXT:       Function Alias Analysis Results
-; GCN-O0-NEXT:       Natural Loop Information
-; GCN-O0-NEXT:       Code sinking
 ; GCN-O0-NEXT:       Post-Dominator Tree Construction
+; GCN-O0-NEXT:       Natural Loop Information
 ; GCN-O0-NEXT:       Legacy Divergence Analysis
+; GCN-O0-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O0-NEXT:       Function Alias Analysis Results
 ; GCN-O0-NEXT:       Memory SSA
 ; GCN-O0-NEXT:       AMDGPU Annotate Uniform Values
@@ -256,6 +258,10 @@
 ; GCN-O1-NEXT:       Natural Loop Information
 ; GCN-O1-NEXT:       Legacy Divergence Analysis
 ; GCN-O1-NEXT:       AMDGPU IR late optimizations
+; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O1-NEXT:       Function Alias Analysis Results
+; GCN-O1-NEXT:       Code sinking
+; GCN-O1-NEXT:       Legacy Divergence Analysis
 ; GCN-O1-NEXT:       Unify divergent function exit nodes
 ; GCN-O1-NEXT:       Lazy Value Information Analysis
 ; GCN-O1-NEXT:       Lower SwitchInst's to branches
@@ -268,12 +274,10 @@
 ; GCN-O1-NEXT:       Detect single entry single exit regions
 ; GCN-O1-NEXT:       Region Pass Manager
 ; GCN-O1-NEXT:         Structurize control flow
-; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT:       Function Alias Analysis Results
-; GCN-O1-NEXT:       Natural Loop Information
-; GCN-O1-NEXT:       Code sinking
 ; GCN-O1-NEXT:       Post-Dominator Tree Construction
+; GCN-O1-NEXT:       Natural Loop Information
 ; GCN-O1-NEXT:       Legacy Divergence Analysis
+; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:       Function Alias Analysis Results
 ; GCN-O1-NEXT:       Memory SSA
 ; GCN-O1-NEXT:       AMDGPU Annotate Uniform Values
@@ -530,6 +534,10 @@
 ; GCN-O1-OPTS-NEXT:       Natural Loop Information
 ; GCN-O1-OPTS-NEXT:       Legacy Divergence Analysis
 ; GCN-O1-OPTS-NEXT:       AMDGPU IR late optimizations
+; GCN-O1-OPTS-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O1-OPTS-NEXT:       Function Alias Analysis Results
+; GCN-O1-OPTS-NEXT:       Code sinking
+; GCN-O1-OPTS-NEXT:       Legacy Divergence Analysis
 ; GCN-O1-OPTS-NEXT:       Unify divergent function exit nodes
 ; GCN-O1-OPTS-NEXT:       Lazy Value Information Analysis
 ; GCN-O1-OPTS-NEXT:       Lower SwitchInst's to branches
@@ -542,12 +550,10 @@
 ; GCN-O1-OPTS-NEXT:       Detect single entry single exit regions
 ; GCN-O1-OPTS-NEXT:       Region Pass Manager
 ; GCN-O1-OPTS-NEXT:         Structurize control flow
-; GCN-O1-OPTS-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O1-OPTS-NEXT:       Function Alias Analysis Results
-; GCN-O1-OPTS-NEXT:       Natural Loop Information
-; GCN-O1-OPTS-NEXT:       Code sinking
 ; GCN-O1-OPTS-NEXT:       Post-Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:       Natural Loop Information
 ; GCN-O1-OPTS-NEXT:       Legacy Divergence Analysis
+; GCN-O1-OPTS-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:       Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:       Memory SSA
 ; GCN-O1-OPTS-NEXT:       AMDGPU Annotate Uniform Values
@@ -812,6 +818,10 @@
 ; GCN-O2-NEXT:       Natural Loop Information
 ; GCN-O2-NEXT:       Legacy Divergence Analysis
 ; GCN-O2-NEXT:       AMDGPU IR late optimizations
+; GCN-O2-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O2-NEXT:       Function Alias Analysis Results
+; GCN-O2-NEXT:       Code sinking
+; GCN-O2-NEXT:       Legacy Divergence Analysis
 ; GCN-O2-NEXT:       Unify divergent function exit nodes
 ; GCN-O2-NEXT:       Lazy Value Information Analysis
 ; GCN-O2-NEXT:       Lower SwitchInst's to branches
@@ -824,12 +834,10 @@
 ; GCN-O2-NEXT:       Detect single entry single exit regions
 ; GCN-O2-NEXT:       Region Pass Manager
 ; GCN-O2-NEXT:         Structurize control flow
-; GCN-O2-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O2-NEXT:       Function Alias Analysis Results
-; GCN-O2-NEXT:       Natural Loop Information
-; GCN-O2-NEXT:       Code sinking
 ; GCN-O2-NEXT:       Post-Dominator Tree Construction
+; GCN-O2-NEXT:       Natural Loop Information
 ; GCN-O2-NEXT:       Legacy Divergence Analysis
+; GCN-O2-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:       Function Alias Analysis Results
 ; GCN-O2-NEXT:       Memory SSA
 ; GCN-O2-NEXT:       AMDGPU Annotate Uniform Values
@@ -1107,6 +1115,10 @@
 ; GCN-O3-NEXT:       Natural Loop Information
 ; GCN-O3-NEXT:       Legacy Divergence Analysis
 ; GCN-O3-NEXT:       AMDGPU IR late optimizations
+; GCN-O3-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O3-NEXT:       Function Alias Analysis Results
+; GCN-O3-NEXT:       Code sinking
+; GCN-O3-NEXT:       Legacy Divergence Analysis
 ; GCN-O3-NEXT:       Unify divergent function exit nodes
 ; GCN-O3-NEXT:       Lazy Value Information Analysis
 ; GCN-O3-NEXT:       Lower SwitchInst's to branches
@@ -1119,12 +1131,10 @@
 ; GCN-O3-NEXT:       Detect single entry single exit regions
 ; GCN-O3-NEXT:       Region Pass Manager
 ; GCN-O3-NEXT:         Structurize control flow
-; GCN-O3-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O3-NEXT:       Function Alias Analysis Results
-; GCN-O3-NEXT:       Natural Loop Information
-; GCN-O3-NEXT:       Code sinking
 ; GCN-O3-NEXT:       Post-Dominator Tree Construction
+; GCN-O3-NEXT:       Natural Loop Information
 ; GCN-O3-NEXT:       Legacy Divergence Analysis
+; GCN-O3-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:       Function Alias Analysis Results
 ; GCN-O3-NEXT:       Memory SSA
 ; GCN-O3-NEXT:       AMDGPU Annotate Uniform Values
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -6,11 +6,11 @@
 ; with exec.
 
 ; GCN-LABEL: {{^}}needs_and:
-; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
-; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
-; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
-; GCN: s_andn2_b64 exec, exec, [[REG3]]
 
+; GCN: s_or_b64 exec, exec, [[REG1:[^ ,]*]]
+; GCN: s_andn2_b64 exec, exec, [[REG2:[^ ,]*]]
+; GCN: s_or_b64 [[REG2:[^ ,]*]], [[REG1:[^ ,]*]], [[REG2:[^ ,]*]]
+; GCN: s_or_b64 exec, exec, [[REG2:[^ ,]*]]
 define void @needs_and(i32 %arg) {
 entry:
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -73,17 +73,16 @@
 ; GCN-NEXT:  BB0_4: ; %LOOP
 ; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
-; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v4
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v4
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    s_cbranch_execz BB0_3
 ; GCN-NEXT:  ; %bb.5: ; %ENDIF
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_b64 s[10:11], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
@@ -0,0 +1,42 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; Test that image.sample instruction is sunk across the branch and not left in the first block. Since the kill may terminate the shader there might be no need for sampling the image.
+
+; GCN-LABEL: {{^}}sinking_img_sample:
+; GCN-NOT: image_sample
+; GCN: branch
+; GCN: image_sample
+; GCN: exp null
+
+define amdgpu_ps float @sinking_img_sample() {
+main_body:
+  %i = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
+  br i1 undef, label %endif1, label %if1
+
+if1:                                              ; preds = %main_body
+  call void @llvm.amdgcn.kill(i1 false) #4
+  br label %exit
+
+endif1:                                           ; preds = %main_body
+  %i22 = extractelement <3 x float> %i, i32 2
+  %i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1
+  br label %exit
+
+exit:                                             ; preds = %endif1, %if1
+  %i24 = phi float [ undef, %if1 ], [ %i23, %endif1 ]
+  ret float %i24
+}
+; Function Attrs: nounwind readonly willreturn
+declare <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.fma.f32(float, float, float) #2
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.kill(i1) #4
+
+attributes #1 = { nounwind readnone }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind readonly willreturn }
+attributes #4 = { nounwind }