Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -639,10 +639,11 @@
     CopiesToReplace.push_back(UseMI);
   } else {
     if (UseMI->isCopy() && OpToFold.isReg() &&
-        Register::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
+        UseMI->getOperand(0).getReg().isVirtual() &&
         TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
-        TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
         !UseMI->getOperand(1).getSubReg()) {
+      LLVM_DEBUG(dbgs() << "Folding " << OpToFold
+                        << "\n into " << *UseMI << '\n');
       unsigned Size = TII->getOpSize(*UseMI, 1);
       UseMI->getOperand(1).setReg(OpToFold.getReg());
       UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
Index: llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -59,7 +59,7 @@
 ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f32:
 ; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|
 ; SIVI-DAG: v_mad_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
-; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
+; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], 2.0, |[[X]]|, v{{[0-9]+}}
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
@@ -76,8 +76,8 @@
 ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32:
 ; SIVI: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 ; SIVI: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
-; GFX10: v_fma_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, {{s[0-9]+}}
-; GFX10: v_fma_f32 {{v[0-9]+}}, |[[X]]|, 2.0, {{s[0-9]+}}
+; GFX10: v_fma_f32 {{v[0-9]+}}, 2.0, |[[X:s[0-9]+]]|, {{v[0-9]+}}
+; GFX10: v_fma_f32 {{v[0-9]+}}, 2.0, |[[X]]|, {{v[0-9]+}}
 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
Index: llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
+++ llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
@@ -1,27 +1,57 @@
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-fold-operands,dead-mi-elimination -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s

-# GCN-LABEL: name: fold_vgpr_copy
+# GCN-LABEL: name: fold_vgpr_to_vgpr_copy
 # GCN: %0:vreg_64 = IMPLICIT_DEF
-# GCN-NEXT: %4:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: %3:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: DS_WRITE2_B32_gfx9 %0.sub0, killed %4, killed %3, 0, 1, 0, implicit $exec
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: DS_WRITE2_B32_gfx9 %0.sub0, killed %1, killed %2, 0, 1, 0, implicit $exec

 ---
-name: fold_vgpr_copy
-registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vgpr_32 }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: vgpr_32 }
-  - { id: 4, class: vgpr_32 }
+name: fold_vgpr_to_vgpr_copy
 body: |
   bb.0:

     %0:vreg_64 = IMPLICIT_DEF
-    %4 = IMPLICIT_DEF
-    %3 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
     %1:vgpr_32 = COPY %0.sub0
     %2:vgpr_32 = COPY %1
     DS_WRITE2_B32_gfx9 %2, killed %4, killed %3, 0, 1, 0, implicit $exec
+...
+
+# GCN-LABEL: name: fold_sgpr_to_vgpr_copy
+# GCN: %0:sreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %4:vgpr_32 = COPY %0.sub0
+# GCN-NEXT: DS_WRITE2_B32_gfx9 %4, killed %1, killed %2, 0, 1, 0, implicit $exec
+---
+name: fold_sgpr_to_vgpr_copy
+body: |
+  bb.0:
+
+    %0:sreg_64 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %1:sgpr_32 = COPY %0.sub0
+    %2:vgpr_32 = COPY %1
+    DS_WRITE2_B32_gfx9 %2, killed %4, killed %3, 0, 1, 0, implicit $exec
+...
+
+# FIXME: src2 of fmac should use scalar register.
+# However, after %0.sub1 is folded into %3 COPY it is not considered for folding anymore.
+# GCN-LABEL: name: fma_sgpr_use
+# GCN: %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %3:vgpr_32 = COPY %0.sub1
+# GCN-NEXT: %4:vgpr_32 = nnan ninf nsz arcp contract afn reassoc V_FMAC_F32_e64 2, %0.sub0, 0, 1073741824, 0, %3, 0, 0, implicit $exec
+---
+name: fma_sgpr_use
+body: |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:sgpr_32 = COPY %0.sub0
+    %2:sgpr_32 = COPY %0.sub1
+    %3:vgpr_32 = COPY %2
+    %4:vgpr_32 = nnan ninf nsz arcp contract afn reassoc V_FMAC_F32_e64 2, %1, 0, 1073741824, 0, %3, 0, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 undef %5:vgpr_32, killed %4, undef %6:vgpr_32, 0, 1, 0, implicit $exec
 ...
Index: llvm/test/CodeGen/AMDGPU/saddo.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/saddo.ll
+++ llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -40,15 +40,15 @@
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    s_add_u32 s8, s6, s0
-; VI-NEXT:    s_addc_u32 s9, s7, s1
+; VI-NEXT:    s_add_u32 s2, s6, s0
+; VI-NEXT:    s_addc_u32 s3, s7, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s7
-; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2]
-; VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
-; VI-NEXT:    v_mov_b32_e32 v3, s9
-; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
+; VI-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_xor_b64 s[0:1], s[8:9], vcc
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v2
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -61,15 +61,15 @@
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_add_u32 s8, s6, s0
-; GFX9-NEXT:    s_addc_u32 s9, s7, s1
+; GFX9-NEXT:    s_add_u32 s2, s6, s0
+; GFX9-NEXT:    s_addc_u32 s3, s7, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc