diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -971,10 +971,11 @@ if (LateCFGStructurize) { addPass(createAMDGPUMachineCFGStructurizerPass()); } - addPass(createSIWholeQuadModePass()); } void GCNPassConfig::addFastRegAlloc() { + addPass(createSIWholeQuadModePass()); + // FIXME: We have to disable the verifier here because of PHIElimination + // TwoAddressInstructions disabling it. @@ -990,6 +991,10 @@ } void GCNPassConfig::addOptimizedRegAlloc() { + // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation + // instructions that cause scheduling barriers. + insertPass(&MachineSchedulerID, &SIWholeQuadModeID); + if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -5,7 +5,11 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 @@ -15,12 +19,8 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 @@ -29,7 +29,10 @@ ; ; GFX10NSA-LABEL: gather4_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -39,15 +42,12 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -58,24 +58,24 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { ; GFX9-LABEL: gather4_cube: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da @@ -84,28 +84,28 @@ ; ; GFX10NSA-LABEL: gather4_cube: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -116,24 +116,24 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { ; GFX9-LABEL: gather4_2darray: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da @@ -142,28 +142,28 @@ ; ; GFX10NSA-LABEL: gather4_2darray: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -174,7 +174,11 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 @@ -184,12 +188,8 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -198,7 +198,10 @@ ; ; GFX10NSA-LABEL: gather4_c_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -208,15 +211,12 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -227,24 +227,24 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -253,28 +253,28 @@ ; ; GFX10NSA-LABEL: gather4_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -285,24 +285,24 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_c_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -311,28 +311,28 @@ ; ; GFX10NSA-LABEL: gather4_c_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -343,7 +343,11 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { ; GFX9-LABEL: gather4_b_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 @@ -353,12 +357,8 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -367,7 +367,10 @@ ; ; GFX10NSA-LABEL: gather4_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -377,15 +380,12 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -396,7 +396,11 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_b_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 @@ -406,12 +410,8 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -420,7 +420,10 @@ ; ; GFX10NSA-LABEL: gather4_c_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -430,15 +433,12 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -449,24 +449,24 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -475,28 +475,28 @@ ; ; GFX10NSA-LABEL: gather4_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -507,24 +507,24 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_c_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16 @@ -533,28 +533,28 @@ ; ; GFX10NSA-LABEL: gather4_c_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -26,7 +26,10 @@ ; ; GFX10NSA-LABEL: gather4_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -36,12 +39,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -74,7 +74,10 @@ ; ; GFX10NSA-LABEL: gather4_cube: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -84,12 +87,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -122,7 +122,10 @@ ; ; GFX10NSA-LABEL: gather4_2darray: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -132,12 +135,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -170,7 +170,10 @@ ; ; GFX10NSA-LABEL: gather4_c_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -180,12 +183,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -218,7 +218,10 @@ ; ; GFX10NSA-LABEL: gather4_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -228,12 +231,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -266,7 +266,10 @@ ; ; GFX10NSA-LABEL: gather4_c_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -276,12 +279,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -314,7 +314,10 @@ ; ; GFX10NSA-LABEL: gather4_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -324,12 +327,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -362,7 +362,10 @@ ; ; GFX10NSA-LABEL: gather4_c_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -372,12 +375,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -410,7 +410,10 @@ ; ; GFX10NSA-LABEL: gather4_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -420,12 +423,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -458,7 +458,10 @@ ; ; GFX10NSA-LABEL: gather4_c_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -468,12 +471,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -674,7 +674,10 @@ ; ; GFX10NSA-LABEL: gather4_2d_dmask_2: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -684,12 +687,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -722,7 +722,10 @@ ; ; GFX10NSA-LABEL: gather4_2d_dmask_4: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -732,12 +735,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -770,7 +770,10 @@ ; ; GFX10NSA-LABEL: gather4_2d_dmask_8: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -780,12 +783,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll @@ -26,7 +26,10 @@ ; ; GFX10-LABEL: gather4_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -36,12 +39,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -74,7 +74,10 @@ ; ; GFX10-LABEL: gather4_c_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -84,12 +87,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -122,7 +122,10 @@ ; ; GFX10-LABEL: gather4_cl_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -132,12 +135,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -170,7 +170,10 @@ ; ; GFX10-LABEL: gather4_c_cl_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -180,12 +183,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -218,7 +218,10 @@ ; ; GFX10-LABEL: gather4_b_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -228,12 +231,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -266,7 +266,10 @@ ; ; GFX10-LABEL: gather4_c_b_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -276,12 +279,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -356,7 +356,10 @@ ; ; GFX10-LABEL: gather4_c_b_cl_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -366,12 +369,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -370,49 +370,49 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v3, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB2_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB2_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -423,48 +423,48 @@ ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_add_u32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v3, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB2_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB2_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -474,57 +474,61 @@ ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v2 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v3 +; GFX1064-NEXT: s_mov_b32 s3, s4 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB2_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -533,50 +537,52 @@ ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB2_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -609,49 +615,49 @@ ; GFX8-LABEL: add_i32_varying_gfx1032: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v3, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB3_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB3_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -662,48 +668,48 @@ ; GFX9-LABEL: add_i32_varying_gfx1032: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_add_u32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v3, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB3_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -713,57 +719,61 @@ ; ; GFX1064-LABEL: add_i32_varying_gfx1032: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v2 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v3 +; GFX1064-NEXT: s_mov_b32 s3, s4 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB3_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB3_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -772,50 +782,52 @@ ; ; GFX1032-LABEL: add_i32_varying_gfx1032: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB3_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB3_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -848,49 +860,49 @@ ; GFX8-LABEL: add_i32_varying_gfx1064: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v3, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -901,48 +913,48 @@ ; GFX9-LABEL: add_i32_varying_gfx1064: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_add_u32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v3, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB4_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -952,57 +964,61 @@ ; ; GFX1064-LABEL: add_i32_varying_gfx1064: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v2 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v3 +; GFX1064-NEXT: s_mov_b32 s3, s4 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB4_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1011,50 +1027,52 @@ ; ; GFX1032-LABEL: add_i32_varying_gfx1064: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_add_rtn_u32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB4_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1923,49 +1941,49 @@ ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v3, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB10_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_sub_rtn_u32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB10_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -1976,48 +1994,48 @@ ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_add_u32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v3, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_sub_rtn_u32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB10_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -2027,57 +2045,61 @@ ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v2 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v3 +; GFX1064-NEXT: s_mov_b32 s3, s4 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB10_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_sub_rtn_u32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB10_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2086,50 +2108,52 @@ ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_sub_rtn_u32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB10_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2661,48 +2685,50 @@ ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, -1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB14_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_and_rtn_b32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB14_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -2713,47 +2739,49 @@ ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 +; GFX9-NEXT: v_mov_b32_e32 v1, -1 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB14_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_and_rtn_b32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB14_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -2763,56 +2791,63 @@ ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_and_b32_dpp v1, v0, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_mov_b32_e32 v0, -1 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v2 +; GFX1064-NEXT: s_mov_b32 s3, s4 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7 +; GFX1064-NEXT: ds_and_rtn_b32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB14_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_and_b32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2821,49 +2856,52 @@ ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032-NEXT: v_mov_b32_e32 v0, -1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7 +; GFX1032-NEXT: ds_and_rtn_b32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB14_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_and_b32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2896,49 +2934,49 @@ ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_or_b32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v3, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB15_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_or_rtn_b32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB15_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -2949,48 +2987,48 @@ ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_or_b32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v3, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB15_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_or_rtn_b32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB15_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -3000,57 +3038,61 @@ ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_or_b32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_or_b32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_or_b32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_or_b32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v2 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_or_b32_dpp v2, v0, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v3 +; GFX1064-NEXT: s_mov_b32 s3, s4 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7 +; GFX1064-NEXT: ds_or_rtn_b32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB15_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_or_b32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3059,50 +3101,52 @@ ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_or_b32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_or_b32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_or_b32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_or_b32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7 +; GFX1032-NEXT: ds_or_rtn_b32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB15_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_or_b32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3135,49 +3179,49 @@ ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_xor_b32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v3, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB16_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_xor_rtn_b32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB16_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -3188,48 +3232,48 @@ ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_xor_b32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v3, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB16_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_xor_rtn_b32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB16_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -3239,57 +3283,61 @@ ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_xor_b32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_xor_b32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_xor_b32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_xor_b32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v2 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_xor_b32_dpp v2, v0, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v3 +; GFX1064-NEXT: s_mov_b32 s3, s4 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7 +; GFX1064-NEXT: ds_xor_rtn_b32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB16_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3298,50 +3346,52 @@ ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_xor_b32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_xor_b32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_xor_b32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_xor_b32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7 +; GFX1032-NEXT: ds_xor_rtn_b32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB16_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3374,48 +3424,50 @@ ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB17_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_max_rtn_i32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB17_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_max_i32_e32 v0, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -3426,47 +3478,49 @@ ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB17_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_max_rtn_i32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB17_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_max_i32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -3476,56 +3530,63 @@ ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_bfrev_b32_e32 v0, 1 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v2 +; GFX1064-NEXT: s_mov_b32 s3, s4 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7 +; GFX1064-NEXT: ds_max_rtn_i32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB17_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_max_i32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3534,49 +3595,52 @@ ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1032-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7 +; GFX1032-NEXT: ds_max_rtn_i32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB17_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_max_i32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3794,48 +3858,50 @@ ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB19_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_i32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB19_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_min_i32_e32 v0, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -3846,47 +3912,49 @@ ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB19_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_i32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB19_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_min_i32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -3896,56 +3964,63 @@ ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1064-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v2 +; GFX1064-NEXT: s_mov_b32 s3, s4 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7 +; GFX1064-NEXT: ds_min_rtn_i32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB19_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_min_i32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3954,49 +4029,52 @@ ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1032-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7 +; GFX1032-NEXT: ds_min_rtn_i32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB19_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_min_i32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -4214,49 +4292,49 @@ ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8-NEXT: v_max_u32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_max_u32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_max_u32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v3, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB21_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_max_rtn_u32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB21_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_max_u32_e32 v0, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -4267,48 +4345,48 @@ ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-NEXT: v_max_u32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_max_u32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_max_u32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v3, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB21_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_max_rtn_u32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB21_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_max_u32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -4318,57 +4396,61 @@ ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_max_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_max_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v2 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_max_u32_dpp v2, v0, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v3 +; GFX1064-NEXT: s_mov_b32 s3, s4 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_max_rtn_u32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB21_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_max_u32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -4377,50 +4459,52 @@ ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_max_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_max_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_max_rtn_u32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB21_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_max_u32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -4635,48 +4719,50 @@ ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, -1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s4, s5 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB23_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_u32 v2, v2, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB23_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_min_u32_e32 v0, s2, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 @@ -4687,47 +4773,49 @@ ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 +; GFX9-NEXT: v_mov_b32_e32 v1, -1 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_mov_b32_dpp v0, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, s5 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB23_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_u32 v2, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB23_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_min_u32_e32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 @@ -4737,56 +4825,63 @@ ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_min_u32_dpp v1, v0, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_mov_b32_e32 v0, -1 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v0, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064-NEXT: v_writelane_b32 v0, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v2 +; GFX1064-NEXT: s_mov_b32 s3, s4 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v0, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB23_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_min_rtn_u32 v2, v2, v3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB23_2: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_min_u32_e32 v0, s3, v2 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -4795,49 +4890,52 @@ ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032-NEXT: v_mov_b32_e32 v0, -1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s4, s5 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vgpr2 +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB23_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_min_rtn_u32 v2, v2, v3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB23_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_min_u32_e32 v0, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -208,229 +208,229 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[10:11], exec -; GFX8-NEXT: ; implicit-def: $vgpr3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX8-NEXT: s_cbranch_execz BB1_4 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_mov_b64 s[10:11], exec -; GFX8-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b64 exec, s[10:11] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s12, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v3, vcc, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s13, v3, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b32 s12, s13 +; GFX8-NEXT: ; implicit-def: $vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX8-NEXT: s_cbranch_execz BB1_3 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: buffer_atomic_add v2, off, s[4:7], 0 glc ; GFX8-NEXT: BB1_3: ; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: BB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX8-NEXT: s_cbranch_vccnz BB1_6 ; GFX8-NEXT: ; %bb.5: ; %if -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX8-NEXT: BB1_6: ; %UnifiedReturnBlock ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[10:11], exec -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX9-NEXT: s_cbranch_execz BB1_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_mov_b64 s[10:11], exec -; GFX9-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s12, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s13, v3, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v0, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b32 s12, s13 +; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX9-NEXT: s_cbranch_execz BB1_3 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: buffer_atomic_add v2, off, s[4:7], 0 glc ; GFX9-NEXT: BB1_3: ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v3, s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-NEXT: BB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz BB1_6 ; GFX9-NEXT: ; %bb.5: ; %if -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX9-NEXT: BB1_6: ; %UnifiedReturnBlock ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-NEXT: ; implicit-def: $vgpr4 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr2 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz BB1_4 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[12:13] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s10, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s11, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v2 +; GFX1064-NEXT: v_permlanex16_b32 v0, v0, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v2, v0, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_readlane_b32 s12, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s12 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: v_readlane_b32 s12, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s13, v2, 31 -; GFX1064-NEXT: v_writelane_b32 v1, s12, 16 -; GFX1064-NEXT: v_readlane_b32 s12, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s13, 32 -; GFX1064-NEXT: v_readlane_b32 s13, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s13, 48 +; GFX1064-NEXT: v_writelane_b32 v0, s12, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[10:11] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s13, 32 +; GFX1064-NEXT: v_readlane_b32 s14, v2, 47 +; GFX1064-NEXT: v_readlane_b32 s15, v2, 63 +; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v2, exec_hi, v3 +; GFX1064-NEXT: s_mov_b32 s12, s15 +; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1064-NEXT: v_writelane_b32 v0, s14, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: ; implicit-def: $vgpr2 ; GFX1064-NEXT: s_and_saveexec_b64 s[30:31], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_3 ; GFX1064-NEXT: ; %bb.2: -; GFX1064-NEXT: v_mov_b32_e32 v0, s12 -; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064-NEXT: v_mov_b32_e32 v2, s12 +; GFX1064-NEXT: buffer_atomic_add v2, off, s[4:7], 0 glc ; GFX1064-NEXT: BB1_3: ; GFX1064-NEXT: v_nop ; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_add_nc_u32_e32 v2, s4, v2 ; GFX1064-NEXT: BB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX1064-NEXT: s_cbranch_vccnz BB1_6 ; GFX1064-NEXT: ; %bb.5: ; %if -; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX1064-NEXT: BB1_6: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr4 +; GFX1032-NEXT: ; implicit-def: $vgpr2 ; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz BB1_4 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s10 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s9, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v2, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s10, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s11, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s11, 16 +; GFX1032-NEXT: v_mov_b32_dpp v0, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s10, v2, 15 +; GFX1032-NEXT: v_readlane_b32 s11, v2, 31 ; GFX1032-NEXT: s_mov_b32 exec_lo, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1032-NEXT: v_writelane_b32 v0, s10, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s9 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_mov_b32 s10, s11 +; GFX1032-NEXT: ; implicit-def: $vgpr2 ; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB1_3 ; GFX1032-NEXT: ; %bb.2: -; GFX1032-NEXT: v_mov_b32_e32 v0, s10 -; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032-NEXT: v_mov_b32_e32 v2, s10 +; GFX1032-NEXT: buffer_atomic_add v2, off, s[4:7], 0 glc ; GFX1032-NEXT: BB1_3: ; GFX1032-NEXT: v_nop ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_add_nc_u32_e32 v2, s4, v2 ; GFX1032-NEXT: BB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_vccnz BB1_6 ; GFX1032-NEXT: ; %bb.5: ; %if -; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX1032-NEXT: BB1_6: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll --- a/llvm/test/CodeGen/AMDGPU/else.ll +++ b/llvm/test/CodeGen/AMDGPU/else.ll @@ -27,10 +27,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b64 [[INIT_EXEC:s\[[0-9]+:[0-9]+\]]], exec ; CHECK: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]], ; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]] -; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]] -; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]] +; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]], +; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]] ; CHECK-NEXT: s_cbranch_execz define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -17,12 +17,12 @@ ; GFX10-LABEL: gather4_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -45,12 +45,12 @@ ; GFX10-LABEL: gather4_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -73,12 +73,12 @@ ; GFX10-LABEL: gather4_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -101,12 +101,12 @@ ; GFX10-LABEL: gather4_c_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -129,12 +129,12 @@ ; GFX10-LABEL: gather4_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -159,12 +159,12 @@ ; GFX10-LABEL: gather4_c_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -187,12 +187,12 @@ ; GFX10-LABEL: gather4_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -215,12 +215,12 @@ ; GFX10-LABEL: gather4_c_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -245,12 +245,12 @@ ; GFX10-LABEL: gather4_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -276,12 +276,12 @@ ; GFX10-LABEL: gather4_c_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -15,10 +15,10 @@ ; GFX10-LABEL: sample_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -41,12 +41,12 @@ ; GFX10-LABEL: sample_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -69,12 +69,12 @@ ; GFX10-LABEL: sample_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -97,12 +97,12 @@ ; GFX10-LABEL: sample_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -125,12 +125,12 @@ ; GFX10-LABEL: sample_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -153,12 +153,12 @@ ; GFX10-LABEL: sample_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -179,10 +179,10 @@ ; GFX10-LABEL: sample_c_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -205,12 +205,12 @@ ; GFX10-LABEL: sample_c_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -233,12 +233,12 @@ ; GFX10-LABEL: sample_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -261,12 +261,12 @@ ; GFX10-LABEL: sample_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -289,12 +289,12 @@ ; GFX10-LABEL: sample_c_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -319,12 +319,12 @@ ; GFX10-LABEL: sample_c_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -345,10 +345,10 @@ ; GFX10-LABEL: sample_b_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -371,12 +371,12 @@ ; GFX10-LABEL: sample_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -397,10 +397,10 @@ ; GFX10-LABEL: sample_c_b_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -423,12 +423,12 @@ ; GFX10-LABEL: sample_c_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -451,12 +451,12 @@ ; GFX10-LABEL: sample_b_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -481,12 +481,12 @@ ; GFX10-LABEL: sample_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -509,12 +509,12 @@ ; GFX10-LABEL: sample_c_b_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -540,12 +540,12 @@ ; GFX10-LABEL: sample_c_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -35,10 +35,10 @@ ; GFX10-LABEL: image_sample_2d_f16: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -52,14 +52,14 @@ ; TONGA-NEXT: s_mov_b64 s[14:15], exec ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: v_mov_b32_e32 v2, 0 -; TONGA-NEXT: v_mov_b32_e32 v4, s12 -; TONGA-NEXT: v_mov_b32_e32 v5, s13 ; TONGA-NEXT: v_mov_b32_e32 v3, v2 -; TONGA-NEXT: s_and_b64 exec, exec, s[14:15] ; TONGA-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 +; TONGA-NEXT: s_and_b64 exec, exec, s[14:15] +; TONGA-NEXT: v_mov_b32_e32 v0, s12 +; TONGA-NEXT: v_mov_b32_e32 v1, s13 ; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: flat_store_dword v[0:1], v3 ; TONGA-NEXT: v_mov_b32_e32 v0, v2 -; TONGA-NEXT: flat_store_dword v[4:5], v3 ; TONGA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; TONGA-NEXT: ; return to shader part epilog ; @@ -68,14 +68,14 @@ ; GFX81-NEXT: s_mov_b64 s[14:15], exec ; GFX81-NEXT: s_wqm_b64 exec, exec ; GFX81-NEXT: v_mov_b32_e32 v2, 0 -; GFX81-NEXT: v_mov_b32_e32 v4, s12 -; GFX81-NEXT: v_mov_b32_e32 v5, s13 ; GFX81-NEXT: v_mov_b32_e32 v3, v2 -; GFX81-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX81-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 +; GFX81-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX81-NEXT: v_mov_b32_e32 v0, s12 +; GFX81-NEXT: v_mov_b32_e32 v1, s13 ; GFX81-NEXT: s_waitcnt vmcnt(0) +; GFX81-NEXT: flat_store_dword v[0:1], v3 ; GFX81-NEXT: v_mov_b32_e32 v0, v2 -; GFX81-NEXT: flat_store_dword v[4:5], v3 ; GFX81-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; @@ -84,31 +84,32 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: global_store_dword v[4:5], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: image_sample_2d_f16_tfe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s14, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: v_nop +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v3, off ; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: global_store_dword v[4:5], v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -241,10 +242,10 @@ ; GFX10-LABEL: image_sample_b_2d_v4f16: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -263,8 +264,8 @@ ; TONGA-NEXT: v_mov_b32_e32 v5, v3 ; TONGA-NEXT: v_mov_b32_e32 v6, v3 ; TONGA-NEXT: v_mov_b32_e32 v7, v3 -; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16 +; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v6 @@ -280,8 +281,8 @@ ; GFX81-NEXT: v_mov_b32_e32 v3, 0 ; GFX81-NEXT: v_mov_b32_e32 v4, v3 ; GFX81-NEXT: v_mov_b32_e32 v5, v3 -; GFX81-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX81-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16 +; GFX81-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: v_mov_b32_e32 v0, v3 ; GFX81-NEXT: v_mov_b32_e32 v1, v4 @@ -295,8 +296,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 @@ -306,7 +307,6 @@ ; GFX10-LABEL: image_sample_b_2d_v4f16_tfe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -314,8 +314,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: v_nop +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -25,10 +25,10 @@ ; GFX10-LABEL: sample_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -39,18 +39,18 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { ; VERDE-LABEL: sample_1d_tfe: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: s_mov_b64 s[16:17], exec +; VERDE-NEXT: s_mov_b64 s[14:15], exec ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: v_mov_b32_e32 v5, v0 ; VERDE-NEXT: v_mov_b32_e32 v0, 0 -; VERDE-NEXT: s_mov_b32 s15, 0xf000 -; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v3, v0 ; VERDE-NEXT: v_mov_b32_e32 v4, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[16:17] ; VERDE-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe +; VERDE-NEXT: s_and_b64 exec, exec, s[14:15] +; VERDE-NEXT: s_mov_b32 s15, 0xf000 +; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -62,36 +62,37 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, s12 -; GFX6789-NEXT: v_mov_b32_e32 v7, s13 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe +; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6789-NEXT: v_mov_b32_e32 v5, s12 +; GFX6789-NEXT: v_mov_b32_e32 v6, s13 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[6:7], v4, off +; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s12 ; encoding: [0x0c,0x02,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, s13 ; encoding: [0x0d,0x02,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] +; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -110,8 +111,8 @@ ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v0, 0 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe +; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; @@ -122,21 +123,22 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe +; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x01,0x81,0xf0,0x02,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -158,8 +160,8 @@ ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v0, 0 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe +; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; @@ -170,21 +172,22 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe +; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x02,0x81,0xf0,0x02,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -206,8 +209,8 @@ ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v0, 0 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe +; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; @@ -218,21 +221,22 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe +; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_3: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x04,0x81,0xf0,0x02,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -254,8 +258,8 @@ ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v0, 0 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe +; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; @@ -266,21 +270,22 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe +; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_4: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x08,0x81,0xf0,0x02,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -303,8 +308,8 @@ ; VERDE-NEXT: v_mov_b32_e32 v0, 0 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: v_mov_b32_e32 v2, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe +; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; @@ -316,22 +321,23 @@ ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe +; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_12: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x03,0x81,0xf0,0x03,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -356,8 +362,8 @@ ; VERDE-NEXT: v_mov_b32_e32 v0, 0 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: v_mov_b32_e32 v2, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe +; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; @@ -369,22 +375,23 @@ ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe +; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_24: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0a,0x81,0xf0,0x03,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -410,8 +417,8 @@ ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v3, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe +; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; @@ -424,23 +431,24 @@ ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe +; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_134: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0d,0x81,0xf0,0x04,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -461,18 +469,18 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { ; VERDE-LABEL: sample_1d_lwe: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: s_mov_b64 s[16:17], exec +; VERDE-NEXT: s_mov_b64 s[14:15], exec ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: v_mov_b32_e32 v5, v0 ; VERDE-NEXT: v_mov_b32_e32 v0, 0 -; VERDE-NEXT: s_mov_b32 s15, 0xf000 -; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v3, v0 ; VERDE-NEXT: v_mov_b32_e32 v4, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[16:17] ; VERDE-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe +; VERDE-NEXT: s_and_b64 exec, exec, s[14:15] +; VERDE-NEXT: s_mov_b32 s15, 0xf000 +; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -484,36 +492,37 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, s12 -; GFX6789-NEXT: v_mov_b32_e32 v7, s13 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 -; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe +; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6789-NEXT: v_mov_b32_e32 v5, s12 +; GFX6789-NEXT: v_mov_b32_e32 v6, s13 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[6:7], v4, off +; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_lwe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s12 ; encoding: [0x0c,0x02,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, s13 ; encoding: [0x0d,0x02,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] +; GFX10-NEXT: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] +; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -546,10 +555,10 @@ ; GFX10-LABEL: sample_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -579,10 +588,10 @@ ; GFX10-LABEL: sample_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -612,10 +621,10 @@ ; GFX10-LABEL: sample_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; encoding: [0x18,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -645,10 +654,10 @@ ; GFX10-LABEL: sample_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x20,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -678,10 +687,10 @@ ; GFX10-LABEL: sample_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x28,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -711,10 +720,10 @@ ; GFX10-LABEL: sample_c_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -744,10 +753,10 @@ ; GFX10-LABEL: sample_c_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -777,10 +786,10 @@ ; GFX10-LABEL: sample_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -810,10 +819,10 @@ ; GFX10-LABEL: sample_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -843,10 +852,10 @@ ; GFX10-LABEL: sample_c_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -876,10 +885,10 @@ ; GFX10-LABEL: sample_c_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -909,10 +918,10 @@ ; GFX10-LABEL: sample_b_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -942,10 +951,10 @@ ; GFX10-LABEL: sample_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -975,10 +984,10 @@ ; GFX10-LABEL: sample_c_b_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1008,10 +1017,10 @@ ; GFX10-LABEL: sample_c_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1041,10 +1050,10 @@ ; GFX10-LABEL: sample_b_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1074,10 +1083,10 @@ ; GFX10-LABEL: sample_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1107,10 +1116,10 @@ ; GFX10-LABEL: sample_c_b_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1140,10 +1149,10 @@ ; GFX10-LABEL: sample_c_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1896,10 +1905,10 @@ ; GFX10-LABEL: sample_1d_unorm: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1929,10 +1938,10 @@ ; GFX10-LABEL: sample_1d_glc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; encoding: [0x00,0x2f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1962,10 +1971,10 @@ ; GFX10-LABEL: sample_1d_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; encoding: [0x00,0x0f,0x80,0xf2,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1995,10 +2004,10 @@ ; GFX10-LABEL: sample_1d_glc_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc slc ; encoding: [0x00,0x2f,0x80,0xf2,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2028,10 +2037,10 @@ ; GFX10-LABEL: adjust_writemask_sample_0: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2062,10 +2071,10 @@ ; GFX10-LABEL: adjust_writemask_sample_01: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x03,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2096,10 +2105,10 @@ ; GFX10-LABEL: adjust_writemask_sample_012: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x07,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2130,10 +2139,10 @@ ; GFX10-LABEL: adjust_writemask_sample_12: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2164,10 +2173,10 @@ ; GFX10-LABEL: adjust_writemask_sample_03: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x09,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2198,10 +2207,10 @@ ; GFX10-LABEL: adjust_writemask_sample_13: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2232,10 +2241,10 @@ ; GFX10-LABEL: adjust_writemask_sample_123: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0e,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2284,10 +2293,10 @@ ; GFX10-LABEL: adjust_writemask_sample_123_to_12: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2318,10 +2327,10 @@ ; GFX10-LABEL: adjust_writemask_sample_013_to_13: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -19,8 +19,9 @@ ; CHECK-LABEL: {{^}}test2: ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; CHECK-DAG: s_mov_b64 [[LIVE2:s\[[0-9]+:[0-9]+\]]], [[LIVE]] ; CHECK-DAG: s_wqm_b64 exec, exec -; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]] +; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE2]] ; CHECK: image_sample v0, [[VAR]], define amdgpu_ps float @test2() #0 { %live = call i1 @llvm.amdgcn.ps.live() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -61,10 +61,10 @@ ;CHECK: s_wqm_b64 exec, exec ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 ;CHECK: s_and_b64 exec, exec, [[ORIG]] ;CHECK: buffer_store_dword ;CHECK; s_wqm_b64 exec, exec -;CHECK: v_add_f32_e32 define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -266,10 +266,8 @@ ; if..then..endif version ; ;CHECK-LABEL: {{^}}test_wwm6_then: -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK: %if ;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword @@ -300,10 +298,8 @@ ; loop version ; ;CHECK-LABEL: {{^}}test_wwm6_loop: -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK: %loop ;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword @@ -354,8 +350,8 @@ ; Check that enabling WQM anywhere enables WQM for the set.inactive source. ; ;CHECK-LABEL: {{^}}test_set_inactive2: -;CHECK: s_wqm_b64 exec, exec ;CHECK: buffer_load_dword +;CHECK: s_wqm_b64 exec, exec ;CHECK: buffer_load_dword define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { main_body: @@ -421,9 +417,8 @@ ;CHECK: image_sample ;CHECK: image_sample ;CHECK: %Flow -;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]] -;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]] +;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]] ;CHECK-NEXT: s_cbranch_execz [[END_BB:BB[0-9]+_[0-9]+]] ;CHECK-NEXT: ; %bb.{{[0-9]+}}: ; %ELSE diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -10,16 +10,16 @@ %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0) -; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] -; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] +; GFX9-DAG: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DAG: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] +; GFX9-DAG: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) %tmp121 = add i32 %tmp105, %tmp120 %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) -; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] -; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] +; GFX9-DAG: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DAG: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] +; GFX9-DAG: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) %tmp136 = add i32 %tmp107, %tmp135 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)