Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2931,7 +2931,7 @@
 
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
   MachineInstr *InsertPt =
-    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
     .addReg(NewExec);
 
Index: test/CodeGen/AMDGPU/indirect-addressing-term.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -0,0 +1,19 @@
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=expand-isel-pseudos < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -stop-after=expand-isel-pseudos < %s | FileCheck -check-prefixes=GCN %s
+
+; Verify that the xor updating EXEC at the end of the waterfall loop emitted
+; for divergent indirect addressing is a terminator (S_XOR_B64_term).
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; GCN-LABEL: @extract_w_offset_vgpr{{.*}}
+; GCN: $exec = S_XOR_B64_term $exec,
+; GCN-NEXT: S_CBRANCH_EXECNZ
+define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1 ; work-item id; presumably what makes the index divergent
+  %index = add i32 %id, 1 ; element index computed at run time from the work-item id
+  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out ; write the extracted element to %out
+  ret void
+}