Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -282,6 +282,9 @@
   assert(!Fold.needsShrink() && "not handled");
 
   if (Fold.isImm()) {
+    // FIXME: ChangeToImmediate should probably clear the subreg flags. It's
+    // reinterpreted as TargetFlags.
+    Old.setSubReg(0);
     Old.ChangeToImmediate(Fold.ImmToFold);
     return true;
   }
Index: llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -26,7 +26,7 @@
 ---
 # GCN-LABEL: name: no_extra_fold_on_same_opnd
 # The first XOR needs commuting to fold that immediate operand.
-# GCN: V_XOR_B32_e32 {{.*}} 0, %1
+# GCN: V_XOR_B32_e32 0, %1
 # GCN: V_XOR_B32_e32 %2, %4.sub0
 name: no_extra_fold_on_same_opnd
 tracksRegLiveness: true
@@ -40,3 +40,22 @@
     %5:vgpr_32 = V_XOR_B32_e32 %1, %4.sub1, implicit $exec
     %6:vgpr_32 = V_XOR_B32_e32 %2, %4.sub0, implicit $exec
 ...
+
+---
+
+# Make sure the subreg index is not reinterpreted when folding
+# immediates
+#
+# GCN-LABEL: name: clear_subreg_imm_fold{{$}}
+# GCN: %1:sgpr_32 = S_MOV_B32 4294967288
+# GCN: %2:sgpr_32 = S_MOV_B32 4294967295
+name: clear_subreg_imm_fold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_64 = S_MOV_B64 -8
+    %1:sgpr_32 = COPY %0.sub0
+    %2:sgpr_32 = COPY %0.sub1
+    S_ENDPGM 0, implicit %1, implicit %2
+
+...
Index: llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -6,10 +6,10 @@
 ; SI-LABEL: widen_i16_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
 ; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s5, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -43,10 +43,10 @@
 ; SI-LABEL: widen_i16_constant_load_zext_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
 ; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s5, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -83,10 +83,10 @@
 ; SI-LABEL: widen_i16_constant_load_sext_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
 ; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s5, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -122,23 +122,25 @@
 define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
 ; SI-LABEL: widen_i17_constant_load:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
-; SI-NEXT:    s_mov_b32 s4, 0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s0, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_add_i32 s0, s0, 34
-; SI-NEXT:    s_or_b32 s0, s0, 4
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    s_bfe_u32 s0, s0, 0x10010
-; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_load_dword s7, s[8:9], 0x0
 ; SI-NEXT:    s_mov_b32 s4, 2
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s5, s0
+; SI-NEXT:    s_mov_b32 s6, s2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_add_i32 s7, s7, 34
+; SI-NEXT:    s_or_b32 s7, s7, 4
+; SI-NEXT:    s_bfe_u32 s8, s7, 0x10010
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: widen_i17_constant_load:
@@ -174,10 +176,10 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
-; SI-NEXT:    s_mov_b32 s1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; SI-NEXT:    s_mov_b32 s0, 0
+; SI-NEXT:    s_mov_b32 s1, s0
 ; SI-NEXT:    v_add_f32_e32 v0, 4.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -205,10 +207,10 @@
 ; SI-LABEL: widen_v2i8_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
 ; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s5, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -260,13 +262,14 @@
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s1, 0
-; SI-NEXT:    s_mov_b32 s0, 0
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s2
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x3e7, v0
 ; SI-NEXT:    v_or_b32_e32 v0, 4, v0
-; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: no_widen_i16_constant_divergent_load:
@@ -299,10 +302,10 @@
 ; SI-LABEL: widen_i1_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
 ; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s5, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -333,10 +336,10 @@
 ; SI-LABEL: widen_i16_zextload_i64_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
 ; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s5, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -373,10 +376,10 @@
 ; SI-LABEL: widen_i1_zext_to_i64_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
 ; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s5, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -415,17 +418,16 @@
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s1, 0
-; SI-NEXT:    s_mov_b32 s5, 0
-; SI-NEXT:    s_mov_b32 s4, 0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
-; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_addk_i32 s0, 0x3e7
-; SI-NEXT:    s_or_b32 s0, s0, 4
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_or_b32 s4, s0, 4
+; SI-NEXT:    s_mov_b32 s0, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: widen_i16_constant32_load:
@@ -453,10 +455,10 @@
 ; SI-LABEL: widen_i16_global_invariant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s5, 0
 ; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s5, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
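
Note (reviewer sketch, not part of the patch): MachineOperand stores a register
operand's subregister index and a non-register operand's target flags in the
same bitfield (SubReg_TargetFlags), and ChangeToImmediate does not clear it, so
a stale subreg index reads back as target flags once the operand becomes an
immediate. That is what the new MIR test exercises: folding S_MOV_B64 -8
through COPYs of sub0/sub1 must produce S_MOV_B32 4294967288 (0xfffffff8) and
S_MOV_B32 4294967295 (0xffffffff) with no leftover flags. Below is a minimal
standalone analogue of the failure mode; the Operand struct and its members
are hypothetical stand-ins, not the real MachineOperand API.

// Hypothetical, simplified stand-in for llvm::MachineOperand; only the
// shared-field behavior is modeled.
#include <cassert>
#include <cstdint>

struct Operand {
  enum Kind { Register, Immediate };
  Kind OpKind = Register;
  unsigned SubRegOrTargetFlags = 0; // subreg index for registers,
                                    // target flags for everything else
  int64_t Imm = 0;

  void changeToImmediate(int64_t V) {
    // Bug analogue: the kind changes, but the shared field still holds
    // the old subreg index, which now reads back as target flags.
    OpKind = Immediate;
    Imm = V;
  }
  unsigned getTargetFlags() const {
    // Registers report no target flags; everything else reads the
    // shared field, so stale contents surface only after the kind flips.
    return OpKind == Register ? 0 : SubRegOrTargetFlags;
  }
};

int main() {
  // Without the fix: the stale subreg index leaks out as target flags.
  Operand A;
  A.SubRegOrTargetFlags = 2; // e.g. a sub1 index on a 64-bit register
  A.changeToImmediate(-1);
  assert(A.getTargetFlags() == 2); // surprising nonzero flags

  // With the patch's fix: clear the subreg before the conversion,
  // the analogue of Old.setSubReg(0) in SIFoldOperands.
  Operand B;
  B.SubRegOrTargetFlags = 2;
  B.SubRegOrTargetFlags = 0;
  B.changeToImmediate(-1);
  assert(B.getTargetFlags() == 0); // flags come out clean
  return 0;
}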