diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1039,6 +1039,7 @@
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
 
+  addPass(createSinkingPass());
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
   addPass(&AMDGPUUnifyDivergentExitNodesID);
@@ -1049,7 +1050,6 @@
     }
     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
   }
-  addPass(createSinkingPass());
   addPass(createAMDGPUAnnotateUniformValues());
   if (!LateCFGStructurize) {
     addPass(createSIAnnotateControlFlowPass());
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -6,11 +6,11 @@
 ; with exec.
 
 ; GCN-LABEL: {{^}}needs_and:
-; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
-; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
-; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
-; GCN: s_andn2_b64 exec, exec, [[REG3]]
+; GCN: s_or_b64 exec, exec, [[REG1:[^ ,]*]]
+; GCN: s_andn2_b64 exec, exec, [[REG2:[^ ,]*]]
+; GCN: s_or_b64 [[REG2]], [[REG1]], [[REG2]]
+; GCN: s_or_b64 exec, exec, [[REG2]]
 define void @needs_and(i32 %arg) {
 entry:
   br label %loop
 
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -73,17 +73,16 @@
 ; GCN-NEXT:  BB0_4: ; %LOOP
 ; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
-; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v4
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v4
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    s_cbranch_execz BB0_3
 ; GCN-NEXT:  ; %bb.5: ; %ENDIF
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_b64 s[10:11], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
@@ -196,7 +195,7 @@
 ; GCN-NEXT:    s_cbranch_execz BB1_9
 ; GCN-NEXT:  BB1_2: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; GCN-NEXT:    s_mov_b64 s[6:7], -1
@@ -213,7 +212,7 @@
 ; GCN-NEXT:    s_cbranch_vccz BB1_5
 ; GCN-NEXT:  ; %bb.4: ; %case1
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v2
 ; GCN-NEXT:    s_mov_b64 s[8:9], 0
@@ -233,7 +232,7 @@
 ; GCN-NEXT:    s_cbranch_vccz BB1_1
 ; GCN-NEXT:  ; %bb.8: ; %case0
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1