Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -513,18 +513,6 @@
   if (UseOp.isReg() && OpToFold.isReg()) {
     if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
       return;
-
-    // Don't fold subregister extracts into tied operands, only if it is a full
-    // copy since a subregister use tied to a full register def doesn't really
-    // make sense. e.g. don't fold:
-    //
-    // %1 = COPY %0:sub1
-    // %2 = V_MAC_{F16, F32} %3, %4, %1
-    //
-    // into
-    // %2 = V_MAC_{F16, F32} %3, %4, %0:sub1
-    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
-      return;
   }
 
   // Special case for REG_SEQUENCE: We can't fold literals into
Index: llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -59,7 +59,7 @@
 ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f32:
 ; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|
 ; SIVI-DAG: v_mad_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
-; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], 2.0, |[[X]]|, v{{[0-9]+}}
+; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
@@ -76,8 +76,8 @@
 ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32:
 ; SIVI: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 ; SIVI: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
-; GFX10: v_fma_f32 {{v[0-9]+}}, 2.0, |[[X:s[0-9]+]]|, {{v[0-9]+}}
-; GFX10: v_fma_f32 {{v[0-9]+}}, 2.0, |[[X]]|, {{v[0-9]+}}
+; GFX10: v_fma_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, {{s[0-9]+}}
+; GFX10: v_fma_f32 {{v[0-9]+}}, |[[X]]|, 2.0, {{s[0-9]+}}
 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
Index: llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
+++ llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
@@ -37,13 +37,9 @@
   DS_WRITE2_B32_gfx9 %2, killed %4, killed %3, 0, 1, 0, implicit $exec
 ...
 
-# FIXME: src2 of fmac should use scalar register.
-# However, after %0.sub1 is folded into %3 COPY it is not considered for folding anymore.
-
 # GCN-LABEL: name: fma_sgpr_use
 # GCN: %0:sreg_64_xexec = IMPLICIT_DEF
-# GCN-NEXT: %3:vgpr_32 = COPY %0.sub1
-# GCN-NEXT: %4:vgpr_32 = nnan ninf nsz arcp contract afn reassoc V_FMAC_F32_e64 2, %0.sub0, 0, 1073741824, 0, %3, 0, 0, implicit $exec
+# GCN-NEXT: %4:vgpr_32 = nnan ninf nsz arcp contract afn reassoc V_FMA_F32 2, %0.sub0, 0, 1073741824, 0, %0.sub1, 0, 0, implicit $exec
 ---
 name: fma_sgpr_use
 body: |