diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1039,6 +1039,7 @@ // FIXME: We need to run a pass to propagate the attributes when calls are // supported. + addPass(createSinkingPass()); // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); @@ -1049,7 +1050,6 @@ } addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions } - addPass(createSinkingPass()); addPass(createAMDGPUAnnotateUniformValues()); if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll --- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll @@ -6,11 +6,11 @@ ; with exec. ; GCN-LABEL: {{^}}needs_and: -; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}} -; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]] -; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]], -; GCN: s_andn2_b64 exec, exec, [[REG3]] +; GCN: s_or_b64 exec, exec, [[REG1:[^ ,]*]] +; GCN: s_andn2_b64 exec, exec, [[REG2:[^ ,]*]] +; GCN: s_or_b64 [[REG2:[^ ,]*]], [[REG1:[^ ,]*]], [[REG2:[^ ,]*]] +; GCN: s_or_b64 exec, exec, [[REG2:[^ ,]*]] define void @needs_and(i32 %arg) { entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -73,17 +73,16 @@ ; GCN-NEXT: BB0_4: ; %LOOP ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v0, v4 ; GCN-NEXT: s_or_b64 s[2:3], s[2:3], exec ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: s_cbranch_execz BB0_3 ; GCN-NEXT: ; %bb.5: ; %ENDIF ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, v5, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 ; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, v5, v0 ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; GCN-NEXT: s_and_b64 s[10:11], vcc, exec ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] @@ -196,7 +195,7 @@ ; GCN-NEXT: s_cbranch_execz BB1_9 ; GCN-NEXT: BB1_2: ; %bb1 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; GCN-NEXT: s_mov_b64 s[6:7], -1 @@ -213,7 +212,7 @@ ; GCN-NEXT: s_cbranch_vccz BB1_5 ; GCN-NEXT: ; %bb.4: ; %case1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v2 ; GCN-NEXT: s_mov_b64 s[8:9], 0 @@ -233,7 +232,7 @@ ; GCN-NEXT: s_cbranch_vccz BB1_1 ; GCN-NEXT: ; %bb.8: ; %case0 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], 0 ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll @@ -0,0 +1,40 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}sinking_img_sample: +; GCN-NOT: image_sample +; GCN: branch +; GCN: image_sample +; GCN: exp null + +define amdgpu_ps float @sinking_img_sample() { +main_body: + %i = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) + br i1 undef, label %endif1, label %if1 + +if1: ; preds = %main_body + call void @llvm.amdgcn.kill(i1 false) #4 + br label %exit + +endif1: ; preds = %main_body + %i22 = extractelement <3 x float> %i, i32 2 + %i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1 + br label %exit + +exit: ; preds = %endif1, %if1 + %i24 = phi float [ undef, %if1 ], [ %i23, %endif1 ] + ret float %i24 +} +; Function Attrs: nounwind readonly willreturn +declare <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 + +; Function Attrs: nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.fma.f32(float, float, float) #2 + +; Function Attrs: nounwind +declare void @llvm.amdgcn.kill(i1) #4 + +attributes #1 = { nounwind readnone } +attributes #2 = { nofree nosync nounwind readnone speculatable willreturn } +attributes #3 = { nounwind readonly willreturn } +attributes #4 = { nounwind }