Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2931,7 +2931,7 @@
 
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
   MachineInstr *InsertPt =
-    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
     .addReg(NewExec);
 
Index: test/CodeGen/AMDGPU/indirect-addressing-term.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -0,0 +1,19 @@
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=expand-isel-pseudos < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -stop-after=expand-isel-pseudos < %s | FileCheck -check-prefixes=GCN %s
+
+; Verify that we consider the xor at the end of the waterfall loop emitted for
+; divergent indirect addressing as a terminator.
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; GCN-LABEL: @extract_w_offset_vgpr{{.*}}
+; GCN: $exec = S_XOR_B64_term $exec,
+; GCN-NEXT: S_CBRANCH_EXECNZ
+define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %index = add i32 %id, 1
+  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}