Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2931,7 +2931,7 @@
 
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
   MachineInstr *InsertPt =
-    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
       .addReg(AMDGPU::EXEC)
       .addReg(NewExec);
 
Index: test/CodeGen/AMDGPU/indirect-addressing-term.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -0,0 +1,20 @@
+; RUN: llc -O0 -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -O0 -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; Verify that we consider the xor at the end of the waterfall loop emitted for
+; divergent indirect addressing as a terminator.
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; There should be no spill code inserted between the xor and the real terminator
+; GCN-LABEL: extract_w_offset_vgpr:
+; GCN: s_xor_b64 exec, exec,
+; GCN-NEXT: s_cbranch_execnz
+define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %index = add i32 %id, 1
+  %value = extractelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}