Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -249,6 +249,7 @@ case ISD::STORE: case ISD::BUILD_VECTOR: case ISD::BITCAST: + case ISD::UNDEF: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: case ISD::EXTRACT_SUBVECTOR: @@ -516,6 +517,7 @@ case ISD::STORE: case ISD::BUILD_VECTOR: case ISD::BITCAST: + case ISD::UNDEF: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: Index: llvm/test/CodeGen/AMDGPU/commute-shifts.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/commute-shifts.ll +++ llvm/test/CodeGen/AMDGPU/commute-shifts.ll @@ -5,14 +5,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 { ; SI-LABEL: main: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_mov_b32 s1, s0 -; SI-NEXT: s_mov_b32 s2, s0 -; SI-NEXT: s_mov_b32 s3, s0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s0 -; SI-NEXT: s_mov_b32 s6, s0 -; SI-NEXT: s_mov_b32 s7, s0 ; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 7, v0 @@ -26,14 +18,6 @@ ; ; VI-LABEL: main: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_mov_b32 s1, s0 -; VI-NEXT: s_mov_b32 s2, s0 -; VI-NEXT: s_mov_b32 s3, s0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s0 -; VI-NEXT: s_mov_b32 s6, s0 -; VI-NEXT: s_mov_b32 s7, s0 ; VI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm ; VI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 7, v0 Index: llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -213,7 +213,7 @@ br label %if.end if.end: ; preds = 
%if.else, %if.then - %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ] + %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ] store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef ret void } @@ -266,7 +266,7 @@ br label %if.end if.end: ; preds = %if.else, %if.then - %call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ] + %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ] store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef ret void } Index: llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -4,16 +4,8 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { ; GCN-LABEL: _amdgpu_ps_main: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s2, s0 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s0 -; GCN-NEXT: s_mov_b32 s6, s0 -; GCN-NEXT: s_mov_b32 s7, s0 ; GCN-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D Index: llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -97,14 +97,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: s_branch .LBB0_4 ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; 
GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB0_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -237,14 +230,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-NEXT: s_branch .LBB1_4 ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB1_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -377,14 +363,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: s_branch .LBB2_4 ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB2_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -555,22 +534,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-NEXT: s_branch .LBB3_4 ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: 
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB3_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc @@ -743,22 +707,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: s_branch .LBB4_4 ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB4_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc @@ -931,22 +880,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-NEXT: s_branch .LBB5_4 ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB5_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -382,18 +382,10 @@ ; GCN-LABEL: insertelement_to_sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, 0 -; GCN-NEXT: s_mov_b32 s4, s12 -; GCN-NEXT: s_mov_b32 s5, s12 -; GCN-NEXT: s_mov_b32 s6, s12 -; GCN-NEXT: s_mov_b32 s7, s12 -; GCN-NEXT: s_mov_b32 s8, s12 -; GCN-NEXT: s_mov_b32 s9, s12 -; GCN-NEXT: s_mov_b32 s10, s12 -; GCN-NEXT: s_mov_b32 s11, s12 -; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef Index: llvm/test/CodeGen/AMDGPU/select-undef.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-undef.ll +++ llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}select_undef_lhs: ; GCN: s_waitcnt @@ -43,3 +43,220 @@ } declare float @llvm.amdgcn.rcp.f32(float) + + +; Make sure the vector undef isn't lowered into 0s. 
+; GCN-LABEL: {{^}}undef_v6f32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <6 x float>, <6 x float> addrspace(3)* undef + %add = fadd <6 x float> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <6 x float> %add, <6 x float> addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}undef_v6i32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef + %add = add <6 x i32> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef + ret void +} + +; Make sure the vector undef isn't lowered into 0s. 
+; GCN-LABEL: {{^}}undef_v5f32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <5 x float>, <5 x float> addrspace(3)* undef + %add = fadd <5 x float> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <5 x float> %add, <5 x float> addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}undef_v5i32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef + %add = add <5 x i32> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef + ret void +} + +; Make sure the vector undef isn't lowered into 0s. 
+; GCN-LABEL: {{^}}undef_v3f64: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr + %add = fadd <3 x double> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v3i64: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr + %add = add <3 x i64> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr + ret void +} + +; Make sure the vector undef isn't lowered into 0s. 
+; GCN-LABEL: {{^}}undef_v4f16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr + %add = fadd <4 x half> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v4i16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr + %add = add <4 x i16> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr + ret void +} + +; Make sure the vector undef isn't lowered into 0s. 
+; GCN-LABEL: {{^}}undef_v2f16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr + %add = fadd <2 x half> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v2i16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr + %add = add <2 x i16> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr + ret void +} + +; We were expanding undef vectors into zero vectors. Optimizations +; would then see we used no elements of the vector, and reform the +; undef vector resulting in a combiner loop. 
+; GCN-LABEL: {{^}}inf_loop_undef_vector: +; GCN: s_waitcnt +; GCN-NEXT: v_mad_u64_u32 +; GCN-NEXT: v_mul_lo_u32 +; GCN-NEXT: v_mul_lo_u32 +; GCN-NEXT: v_add3_u32 +; GCN-NEXT: global_store_dwordx2 +define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) { + %i = insertelement <6 x float> %arg, float %arg1, i64 2 + %i3 = bitcast <6 x float> %i to <3 x i64> + %i4 = extractelement <3 x i64> %i3, i64 0 + %i5 = extractelement <3 x i64> %i3, i64 1 + %i6 = mul i64 %i5, %arg2 + %i7 = add i64 %i6, %i4 + store volatile i64 %i7, i64 addrspace(1)* undef, align 4 + ret void +} Index: llvm/test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1406,28 +1406,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 { ; SI-LABEL: if_after_kill_block: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec -; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB13_3 ; SI-NEXT: ; %bb.1: ; %bb3 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_cbranch_scc0 .LBB13_6 ; SI-NEXT: ; %bb.2: ; %bb3 ; SI-NEXT: s_andn2_b64 exec, exec, vcc ; SI-NEXT: .LBB13_3: ; %bb4 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_mov_b32 s1, s0 -; SI-NEXT: s_mov_b32 s2, s0 -; SI-NEXT: s_mov_b32 s3, s0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s0 -; SI-NEXT: s_mov_b32 s6, s0 -; SI-NEXT: s_mov_b32 s7, s0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 @@ -1448,28 +1440,20 @@ ; ; GFX10-WAVE64-LABEL: if_after_kill_block: ; GFX10-WAVE64: ; %bb.0: ; %bb -; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-WAVE64-NEXT: s_wqm_b64 exec, exec ; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; GFX10-WAVE64-NEXT: s_mov_b32 s0, 0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc ; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4 -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10-WAVE64-NEXT: s_mov_b32 s1, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s2, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s3, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s4, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s5, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s6, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s7, s0 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-WAVE64-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 @@ -1488,28 +1472,20 @@ ; ; GFX10-WAVE32-LABEL: if_after_kill_block: ; GFX10-WAVE32: ; %bb.0: ; %bb -; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-WAVE32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v1 -; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s2, exec_lo, s2 +; 
GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 -; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo +; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo ; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4 -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-WAVE32-NEXT: s_mov_b32 s1, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s2, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s3, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s4, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s5, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s6, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s7, s0 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-WAVE32-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 @@ -1528,29 +1504,22 @@ ; ; GFX11-LABEL: if_after_kill_block: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_wqm_b64 exec, exec -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b64 s[4:5], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1 -; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11-NEXT: s_cbranch_execz .LBB13_3 ; GFX11-NEXT: ; %bb.1: ; %bb3 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX11-NEXT: ; %bb.2: ; %bb3 ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc ; GFX11-NEXT: .LBB13_3: ; %bb4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 exec, exec, 
s[4:5] -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1593,19 +1562,11 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; SI-LABEL: cbranch_kill: ; SI: ; %bb.0: ; %.entry -; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: s_mov_b32 s5, s4 -; SI-NEXT: s_mov_b32 s6, s4 -; SI-NEXT: s_mov_b32 s7, s4 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s4 -; SI-NEXT: s_mov_b32 s10, s4 -; SI-NEXT: s_mov_b32 s11, s4 -; SI-NEXT: image_sample_l v1, v[1:4], s[4:11], s[0:3] dmask:0x1 da +; SI-NEXT: image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc @@ -1636,16 +1597,8 @@ ; GFX10-WAVE64-LABEL: cbranch_kill: ; GFX10-WAVE64: ; %bb.0: ; %.entry ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE64-NEXT: s_mov_b32 s4, 0 ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec -; GFX10-WAVE64-NEXT: s_mov_b32 s5, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s6, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s7, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s8, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s9, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s10, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s11, s4 -; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; 
GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc @@ -1676,16 +1629,8 @@ ; GFX10-WAVE32-LABEL: cbranch_kill: ; GFX10-WAVE32: ; %bb.0: ; %.entry ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE32-NEXT: s_mov_b32 s4, 0 ; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo -; GFX10-WAVE32-NEXT: s_mov_b32 s5, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s6, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s7, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s8, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s9, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s10, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s11, s4 -; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_ge_f32_e32 vcc_lo, 0, v1 ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo @@ -1716,16 +1661,8 @@ ; GFX11-LABEL: cbranch_kill: ; GFX11: ; %bb.0: ; %.entry ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: s_mov_b32 s5, s4 -; GFX11-NEXT: s_mov_b32 s6, s4 -; GFX11-NEXT: s_mov_b32 s7, s4 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s4 -; GFX11-NEXT: s_mov_b32 s10, s4 -; GFX11-NEXT: s_mov_b32 s11, s4 -; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmpx_ge_f32_e32 0, v1 Index: llvm/test/CodeGen/AMDGPU/v1024.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v1024.ll +++ llvm/test/CodeGen/AMDGPU/v1024.ll @@ -10,6 +10,7 @@ entry: %alloca = alloca <32 x i32>, align 16, addrspace(5) %cast = bitcast <32 x i32> addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i32(i8 
addrspace(5)* %cast, i8 0, i32 128, i1 false) br i1 undef, label %if.then.i.i, label %if.else.i if.then.i.i: ; preds = %entry @@ -24,6 +25,7 @@ ret void } +declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg) declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg) Index: llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -14,7 +14,6 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v36, v16 @@ -22,13 +21,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v34, v14 ; GFX9-NEXT: v_mov_b32_e32 v33, v13 ; GFX9-NEXT: v_mov_b32_e32 v32, v12 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_mov_b32 s6, s4 -; GFX9-NEXT: s_mov_b32 s7, s4 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s4 -; GFX9-NEXT: s_mov_b32 s10, s4 -; GFX9-NEXT: s_mov_b32 s11, s4 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -82,16 +74,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v34, v14 ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_mov_b32 s6, s4 -; GFX10-NEXT: s_mov_b32 s7, s4 
-; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s4 -; GFX10-NEXT: s_mov_b32 s10, s4 -; GFX10-NEXT: s_mov_b32 s11, s4 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -145,16 +129,8 @@ ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 ; GFX11-NEXT: v_mov_b32_e32 v32, v12 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 @@ -225,65 +201,41 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v40, s36, 2 -; GFX9-NEXT: v_writelane_b32 v40, s37, 3 -; GFX9-NEXT: v_writelane_b32 v40, s38, 4 -; GFX9-NEXT: v_writelane_b32 v40, s39, 5 -; GFX9-NEXT: v_writelane_b32 v40, s40, 6 -; GFX9-NEXT: v_writelane_b32 v40, s41, 7 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v40, s42, 8 -; GFX9-NEXT: s_mov_b32 s36, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 
4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s43, 9 ; GFX9-NEXT: v_mov_b32_e32 v45, v16 ; GFX9-NEXT: v_mov_b32_e32 v44, v15 ; GFX9-NEXT: v_mov_b32_e32 v43, v14 ; GFX9-NEXT: v_mov_b32_e32 v42, v13 ; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: s_mov_b32 s37, s36 -; GFX9-NEXT: s_mov_b32 s38, s36 -; GFX9-NEXT: s_mov_b32 s39, s36 -; GFX9-NEXT: s_mov_b32 s40, s36 -; GFX9-NEXT: s_mov_b32 s41, s36 -; GFX9-NEXT: s_mov_b32 s42, s36 -; GFX9-NEXT: s_mov_b32 s43, s36 -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s43, v40, 9 -; GFX9-NEXT: v_readlane_b32 s42, v40, 8 -; GFX9-NEXT: v_readlane_b32 s41, v40, 7 -; 
GFX9-NEXT: v_readlane_b32 s40, v40, 6 -; GFX9-NEXT: v_readlane_b32 s39, v40, 5 -; GFX9-NEXT: v_readlane_b32 s38, v40, 4 -; GFX9-NEXT: v_readlane_b32 s37, v40, 3 -; GFX9-NEXT: v_readlane_b32 s36, v40, 2 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -298,66 +250,42 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v41, v16 ; GFX10-NEXT: v_mov_b32_e32 v42, v15 ; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_mov_b32_e32 v44, v13 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v44, v13 ; GFX10-NEXT: v_mov_b32_e32 v45, v12 -; GFX10-NEXT: 
v_writelane_b32 v40, s36, 2 -; GFX10-NEXT: s_mov_b32 s36, 0 -; GFX10-NEXT: v_writelane_b32 v40, s37, 3 -; GFX10-NEXT: s_mov_b32 s37, s36 -; GFX10-NEXT: v_writelane_b32 v40, s38, 4 -; GFX10-NEXT: s_mov_b32 s38, s36 -; GFX10-NEXT: v_writelane_b32 v40, s39, 5 -; GFX10-NEXT: s_mov_b32 s39, s36 -; GFX10-NEXT: v_writelane_b32 v40, s40, 6 -; GFX10-NEXT: s_mov_b32 s40, s36 -; GFX10-NEXT: v_writelane_b32 v40, s41, 7 -; GFX10-NEXT: s_mov_b32 s41, s36 -; GFX10-NEXT: v_writelane_b32 v40, s42, 8 -; GFX10-NEXT: s_mov_b32 s42, s36 -; GFX10-NEXT: v_writelane_b32 v40, s43, 9 -; GFX10-NEXT: s_mov_b32 s43, s36 -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_clause 0x4 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 -; GFX10-NEXT: v_readlane_b32 s43, v40, 9 -; GFX10-NEXT: v_readlane_b32 s42, v40, 8 -; GFX10-NEXT: v_readlane_b32 s41, v40, 7 -; GFX10-NEXT: v_readlane_b32 s40, v40, 6 -; GFX10-NEXT: v_readlane_b32 s39, v40, 5 -; GFX10-NEXT: v_readlane_b32 s38, v40, 4 -; GFX10-NEXT: v_readlane_b32 s37, v40, 3 -; GFX10-NEXT: 
v_readlane_b32 s36, v40, 2 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -372,7 +300,7 @@ ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 @@ -380,56 +308,32 @@ ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 ; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v45, s33 +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 ; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_mov_b32_e32 v45, v12 -; GFX11-NEXT: v_writelane_b32 v40, s36, 2 -; GFX11-NEXT: s_mov_b32 s36, 0 -; GFX11-NEXT: v_writelane_b32 v40, s37, 3 -; GFX11-NEXT: s_mov_b32 s37, s36 -; GFX11-NEXT: v_writelane_b32 v40, s38, 4 -; GFX11-NEXT: s_mov_b32 s38, s36 -; GFX11-NEXT: v_writelane_b32 v40, s39, 5 -; GFX11-NEXT: s_mov_b32 s39, s36 -; GFX11-NEXT: v_writelane_b32 v40, s40, 6 -; GFX11-NEXT: s_mov_b32 s40, s36 -; GFX11-NEXT: v_writelane_b32 v40, s41, 7 -; GFX11-NEXT: s_mov_b32 s41, s36 
-; GFX11-NEXT: v_writelane_b32 v40, s42, 8 -; GFX11-NEXT: s_mov_b32 s42, s36 -; GFX11-NEXT: v_writelane_b32 v40, s43, 9 -; GFX11-NEXT: s_mov_b32 s43, s36 -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_load_b32 v45, off, s33 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 -; GFX11-NEXT: v_readlane_b32 s43, v40, 9 -; GFX11-NEXT: v_readlane_b32 s42, v40, 8 -; GFX11-NEXT: v_readlane_b32 s41, v40, 7 -; GFX11-NEXT: v_readlane_b32 s40, v40, 6 -; GFX11-NEXT: v_readlane_b32 s39, v40, 5 -; GFX11-NEXT: v_readlane_b32 s38, v40, 4 -; GFX11-NEXT: v_readlane_b32 s37, v40, 3 -; GFX11-NEXT: v_readlane_b32 s36, v40, 2 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 Index: llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll +++ llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll @@ -6,7 +6,7 @@ define amdgpu_cs void @xyz () { ; CHECK-LABEL: xyz: -; CHECK: v_wmma_f32_16x16x16_f16 v[0:3], v[4:11], v[4:11], v[0:3] +; CHECK: v_wmma_f32_16x16x16_f16 v[0:3], v[0:7], v[0:7], v[0:3] .entry: br label %loop Index: llvm/test/CodeGen/AMDGPU/wqm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/wqm.ll +++ llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1833,87 +1833,54 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX9-W64-LABEL: test_loop_vcc: ; GFX9-W64: ; %bb.0: ; %entry -; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-W64-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 ; GFX9-W64-NEXT: s_branch .LBB31_2 ; GFX9-W64-NEXT: .LBB31_1: ; %body ; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8 -; 
GFX9-W64-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4 ; GFX9-W64-NEXT: .LBB31_2: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s10, v8 +; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1 ; GFX9-W64-NEXT: ; %bb.3: -; GFX9-W64-NEXT: s_mov_b64 s[2:3], -1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8 ; GFX9-W64-NEXT: .LBB31_4: ; %break -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_loop_vcc: ; GFX10-W32: ; %bb.0: ; %entry -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_branch .LBB31_2 ; GFX10-W32-NEXT: .p2align 6 ; GFX10-W32-NEXT: .LBB31_1: ; %body ; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; 
GFX10-W32-NEXT: s_mov_b32 s7, s0 -; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_mov_b32 s1, 0 +; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4 ; GFX10-W32-NEXT: .LBB31_2: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1925,11 +1892,10 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1 ; GFX10-W32-NEXT: ; %bb.3: -; GFX10-W32-NEXT: s_mov_b32 s1, -1 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8 ; GFX10-W32-NEXT: .LBB31_4: ; %break -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 @@ -1999,14 +1965,6 @@ ; GFX9-W64-NEXT: v_lshl_add_u32 v0, v2, 2, v0 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2035,14 +1993,6 @@ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: buffer_load_dword v0, v2, s[8:11], 0 offen ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2079,18 +2029,10 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; GFX9-W64-LABEL: test_nonvoid_return: ; GFX9-W64: ; %bb.0: -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2098,18 +2040,10 @@ ; ; GFX10-W32-LABEL: test_nonvoid_return: ; GFX10-W32: ; %bb.0: -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2128,20 +2062,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { ; GFX9-W64-LABEL: test_nonvoid_return_unreachable: ; GFX9-W64: ; %bb.0: ; 
%entry -; GFX9-W64-NEXT: s_mov_b32 s4, 0 -; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, exec ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB34_2 ; GFX9-W64-NEXT: ; %bb.1: ; %else @@ -2155,20 +2080,11 @@ ; ; GFX10-W32-LABEL: test_nonvoid_return_unreachable: ; GFX10-W32: ; %bb.0: ; %entry -; GFX10-W32-NEXT: s_mov_b32 s4, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s5, s4 -; GFX10-W32-NEXT: s_mov_b32 s6, s4 -; GFX10-W32-NEXT: s_mov_b32 s7, s4 -; GFX10-W32-NEXT: s_mov_b32 s8, s4 -; GFX10-W32-NEXT: s_mov_b32 s9, s4 -; GFX10-W32-NEXT: s_mov_b32 s10, s4 -; GFX10-W32-NEXT: s_mov_b32 s11, s4 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB34_2 ; GFX10-W32-NEXT: ; %bb.1: ; %else @@ -2215,33 +2131,17 @@ ; GFX9-W64-NEXT: 
s_cmp_lt_i32 s0, 1 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX9-W64-NEXT: ; %bb.1: ; %else -; GFX9-W64-NEXT: s_mov_b32 s4, 0 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 -; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 ; GFX9-W64-NEXT: s_branch .LBB35_4 ; GFX9-W64-NEXT: .LBB35_2: ; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-W64-NEXT: .LBB35_3: ; %if -; GFX9-W64-NEXT: s_mov_b32 s4, 0 -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: .LBB35_4: ; %end ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0 @@ -2252,21 +2152,13 @@ ; GFX10-W32-LABEL: test_scc: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX10-W32-NEXT: ; %bb.1: ; %else ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; 
GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 ; GFX10-W32-NEXT: s_branch .LBB35_4 @@ -2275,17 +2167,9 @@ ; GFX10-W32-NEXT: .LBB35_3: ; %if ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: .LBB35_4: ; %end -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)