Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2931,7 +2931,7 @@
 
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
   MachineInstr *InsertPt =
-    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
       .addReg(AMDGPU::EXEC)
       .addReg(NewExec);
 
Index: test/CodeGen/AMDGPU/indirect-addressing-term.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -0,0 +1,20 @@
+; RUN: llc -O0 -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -O0 -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; Verify that we consider the xor at the end of the waterfall loop emitted for
+; divergent indirect addressing as a terminator.
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; There should be no spill code inserted between the xor and the real terminator
+; GCN-LABEL: extract_w_offset_vgpr:
+; GCN: s_xor_b64 exec, exec,
+; GCN-NEXT: s_cbranch_execnz
+define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %index = add i32 %id, 1
+  %value = extractelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}