diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1109,6 +1109,10 @@ bool GCNPassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); + // TODO: Fix instruction selection to do the right thing for image + // instructions with tfe or lwe in the first place, instead of running a + // separate pass to fix them up? + addPass(createSIAddIMGInitPass()); return false; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -655,6 +655,7 @@ define amdgpu_ps float @load_1d_f16_tfe_dmask_x(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask_x: ; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 ; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 ; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 @@ -663,13 +664,15 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 -; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; ; GFX8-PACKED-LABEL: load_1d_f16_tfe_dmask_x: ; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 ; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 ; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 @@ -678,13 +681,15 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 -; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe d16 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-PACKED-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe d16 ; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_f16_tfe_dmask_x: ; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -693,13 +698,15 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f16_tfe_dmask_x: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -708,9 +715,10 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { half, i32 } %v, 1 @@ -721,6 +729,7 @@ define amdgpu_ps float @load_1d_v2f16_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask_xy: ; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 ; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 ; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 @@ -729,13 +738,16 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 -; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x3 unorm tfe d16 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x3 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; ; GFX8-PACKED-LABEL: load_1d_v2f16_tfe_dmask_xy: ; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 ; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 ; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 @@ -744,13 +756,15 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 -; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm tfe d16 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-PACKED-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x3 unorm tfe d16 ; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_v2f16_tfe_dmask_xy: ; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -759,13 +773,15 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm tfe d16 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x3 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f16_tfe_dmask_xy: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -774,9 +790,10 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <2 x half>, i32 } %v, 1 @@ -787,6 +804,7 @@ define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask_xyz: ; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 ; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 ; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 @@ -795,13 +813,17 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 -; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe d16 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:4], v0, s[0:7] dmask:0x7 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; ; GFX8-PACKED-LABEL: load_1d_v3f16_tfe_dmask_xyz: ; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 ; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 ; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 @@ -810,13 +832,16 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 -; GFX8-PACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm tfe d16 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-PACKED-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x7 unorm tfe d16 ; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_v3f16_tfe_dmask_xyz: ; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -825,13 +850,16 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm tfe d16 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x7 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v3f16_tfe_dmask_xyz: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -840,9 +868,11 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: ; return to shader part epilog %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <3 x half>, i32 } %v, 1 @@ -853,6 +883,7 @@ define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask_xyzw: ; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 ; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 ; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 @@ -861,13 +892,15 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 -; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe d16 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; ; GFX8-PACKED-LABEL: load_1d_v4f16_tfe_dmask_xyzw: ; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 ; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 ; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 @@ -876,13 +909,15 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 -; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe d16 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-PACKED-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe d16 ; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_v4f16_tfe_dmask_xyzw: ; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -891,13 +926,15 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe d16 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v4f16_tfe_dmask_xyzw: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -906,9 +943,10 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 16, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <4 x half>, i32 } %v, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll @@ -466,6 +466,7 @@ define amdgpu_ps float @load_1d_f32_tfe_dmask_x(<8 x i32> inreg %rsrc, i32 %s) { ; GFX6-LABEL: load_1d_f32_tfe_dmask_x: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -474,13 +475,15 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_f32_tfe_dmask_x: ; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 s0, s2 ; GFX8-NEXT: s_mov_b32 s1, s3 ; GFX8-NEXT: s_mov_b32 s2, s4 @@ -489,13 +492,15 @@ ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f32_tfe_dmask_x: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -504,9 +509,10 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call { float, i32 } @llvm.amdgcn.image.load.1d.sl_f32i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { float, i32 } %v, 1 @@ -517,6 +523,7 @@ define amdgpu_ps float @load_1d_v2f32_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s) { ; GFX6-LABEL: load_1d_v2f32_tfe_dmask_xy: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -525,13 +532,16 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x3 unorm tfe +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x3 unorm tfe ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v2f32_tfe_dmask_xy: ; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 s0, s2 ; GFX8-NEXT: s_mov_b32 s1, s3 ; GFX8-NEXT: s_mov_b32 s2, s4 @@ -540,13 +550,16 @@ ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x3 unorm tfe +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x3 unorm tfe ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f32_tfe_dmask_xy: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -555,9 +568,11 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: ; return to shader part epilog %v = call { <2 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f32i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <2 x float>, i32 } %v, 1 @@ -568,6 +583,7 @@ define amdgpu_ps float @load_1d_v3f32_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX6-LABEL: load_1d_v3f32_tfe_dmask_xyz: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -576,13 +592,17 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: image_load v[1:4], v0, s[0:7] dmask:0x7 unorm tfe ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v3f32_tfe_dmask_xyz: ; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 s0, s2 ; GFX8-NEXT: s_mov_b32 s1, s3 ; GFX8-NEXT: s_mov_b32 s2, s4 @@ -591,13 +611,17 @@ ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: image_load v[1:4], v0, s[0:7] dmask:0x7 unorm tfe ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v3f32_tfe_dmask_xyz: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -606,9 +630,12 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: image_load v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: ; return to shader part epilog %v = call { <3 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f32i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <3 x float>, i32 } %v, 1 @@ -619,6 +646,7 @@ define amdgpu_ps float @load_1d_v4f32_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX6-LABEL: load_1d_v4f32_tfe_dmask_xyzw: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -627,13 +655,15 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v4f32_tfe_dmask_xyzw: ; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 s0, s2 ; GFX8-NEXT: s_mov_b32 s1, s3 ; GFX8-NEXT: s_mov_b32 s2, s4 @@ -642,13 +672,15 @@ ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v4f32_tfe_dmask_xyzw: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -657,9 +689,10 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 16, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <4 x float>, i32 } %v, 1 @@ -670,6 +703,7 @@ define amdgpu_ps float @load_1d_f32_tfe_dmask_0(<8 x i32> inreg %rsrc, i32 %s) { ; GFX6-LABEL: load_1d_f32_tfe_dmask_0: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -678,13 +712,15 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_f32_tfe_dmask_0: ; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 s0, s2 ; GFX8-NEXT: s_mov_b32 s1, s3 ; GFX8-NEXT: s_mov_b32 s2, s4 @@ -693,13 +729,15 @@ ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f32_tfe_dmask_0: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -708,9 +746,10 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call { float, i32 } @llvm.amdgcn.image.load.1d.sl_f32i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { float, i32 } %v, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -37,6 +37,8 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) { ; GFX6-LABEL: load_2d_v4f32_xyzw_tfe: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -45,7 +47,12 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 ; GFX6-NEXT: s_mov_b32 s10, -1 @@ -57,18 +64,29 @@ ; ; GFX10-LABEL: load_2d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, v11 +; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v7, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) @@ -81,6 +99,8 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) { ; GFX6-LABEL: load_2d_v4f32_xyzw_tfe_lwe: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -89,7 +109,12 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe lwe +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe lwe ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 ; GFX6-NEXT: s_mov_b32 s10, -1 @@ -101,18 +126,29 @@ ; ; GFX10-LABEL: load_2d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, v11 +; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v7, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -49,8 +49,15 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_and_or_b32 v10, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -59,9 +66,11 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 -; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe da -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -69,21 +78,30 @@ ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v8, v5 +; GFX10-NEXT: v_mov_b32_e32 v9, v5 +; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 +; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v3, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v9 +; GFX10-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -100,8 +118,15 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_and_or_b32 v10, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -110,9 +135,11 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 -; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe da -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe lwe da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -120,21 +147,30 @@ ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v8, v5 +; GFX10-NEXT: v_mov_b32_e32 v9, v5 +; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 +; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe lwe +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v3, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v9 +; GFX10-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe lwe ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -37,6 +37,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX6-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -45,7 +47,14 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe da +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf unorm tfe da ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 ; GFX6-NEXT: s_mov_b32 s10, -1 @@ -57,6 +66,16 @@ ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v9 +; GFX10-NEXT: v_mov_b32_e32 v11, v9 +; GFX10-NEXT: v_mov_b32_e32 v12, v9 +; GFX10-NEXT: v_mov_b32_e32 v13, v9 +; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -65,10 +84,13 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe +; GFX10-NEXT: v_mov_b32_e32 v1, v10 +; GFX10-NEXT: v_mov_b32_e32 v2, v11 +; GFX10-NEXT: v_mov_b32_e32 v3, v12 +; GFX10-NEXT: v_mov_b32_e32 v4, v13 +; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v9, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -81,6 +103,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX6-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -89,7 +113,14 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe lwe da +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf unorm tfe lwe da ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 ; GFX6-NEXT: s_mov_b32 s10, -1 @@ -101,6 +132,16 @@ ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v9 +; GFX10-NEXT: v_mov_b32_e32 v11, v9 +; GFX10-NEXT: v_mov_b32_e32 v12, v9 +; GFX10-NEXT: v_mov_b32_e32 v13, v9 +; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -109,10 +150,13 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe +; GFX10-NEXT: v_mov_b32_e32 v1, v10 +; GFX10-NEXT: v_mov_b32_e32 v2, v11 +; GFX10-NEXT: v_mov_b32_e32 v3, v12 +; GFX10-NEXT: v_mov_b32_e32 v4, v13 +; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v9, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -48,20 +48,29 @@ ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 +; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 -; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -69,6 +78,7 @@ ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 @@ -76,14 +86,22 @@ ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v8, v5 +; GFX10-NEXT: v_mov_b32_e32 v9, v5 +; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 +; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v3, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v9 +; GFX10-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -99,20 +117,29 @@ ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 +; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 -; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe lwe ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -120,6 +147,7 @@ ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 @@ -127,14 +155,22 @@ ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v8, v5 +; GFX10-NEXT: v_mov_b32_e32 v9, v5 +; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 +; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v3, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v9 +; GFX10-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -37,6 +37,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) { ; GFX6-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -45,7 +47,13 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 ; GFX6-NEXT: s_mov_b32 s10, -1 @@ -57,7 +65,16 @@ ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v12, v11 +; GFX10-NEXT: v_mov_b32_e32 v13, v11 +; GFX10-NEXT: v_mov_b32_e32 v14, v11 +; GFX10-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -65,10 +82,13 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe +; GFX10-NEXT: v_mov_b32_e32 v1, v12 +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v3, v14 +; GFX10-NEXT: v_mov_b32_e32 v4, v15 +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v11, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0) @@ -81,6 +101,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) { ; GFX6-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -89,7 +111,13 @@ ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 ; GFX6-NEXT: s_mov_b32 s10, -1 @@ -101,7 +129,16 @@ ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v12, v11 +; GFX10-NEXT: v_mov_b32_e32 v13, v11 +; GFX10-NEXT: v_mov_b32_e32 v14, v11 +; GFX10-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -109,10 +146,13 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe +; GFX10-NEXT: v_mov_b32_e32 v1, v12 +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v3, v14 +; GFX10-NEXT: v_mov_b32_e32 v4, v15 +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v11, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)