Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -505,17 +505,15 @@
     if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
       return;
 
-    // Don't fold subregister extracts into tied operands, only if it is a full
-    // copy since a subregister use tied to a full register def doesn't really
-    // make sense. e.g. don't fold:
-    //
-    // %1 = COPY %0:sub1
-    // %2 = V_MAC_{F16, F32} %3, %4, %1
-    //
-    // into
-    // %2 = V_MAC_{F16, F32} %3, %4, %0:sub1
-    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
-      return;
+    // Allow folding subregister extract into tied operands for
+    // v_mac and v_fmac opcodes only
+    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister) {
+      unsigned UseOpc = UseOp.getParent()->getOpcode();
+      if (UseOpc != AMDGPU::V_MAC_F32_e64 &&
+          UseOpc != AMDGPU::V_MAC_F16_e64 &&
+          UseOpc != AMDGPU::V_FMAC_F32_e64)
+        return;
+    }
   }
 
   // Special case for REG_SEQUENCE: We can't fold literals into
Index: test/CodeGen/AMDGPU/fmac-fma-sgpr-copy.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fmac-fma-sgpr-copy.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s
+
+; CHECK: v_fma_f32 v0, v1, v0, s0
+define amdgpu_cs float @test1(<4 x i32> inreg %a, float %b, float %y) {
+entry:
+  %buf.load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %a, i32 0, i32 0)
+  %vec1 = bitcast <4 x i32> %buf.load to <4 x float>
+  %.i095 = extractelement <4 x float> %vec1, i32 0
+  %.i098 = fsub nnan arcp float %b, %.i095
+  %fma1 = call float @llvm.fma.f32(float %y, float %.i098, float %.i095) #3
+  ret float %fma1
+}
+
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) #2
+declare float @llvm.fma.f32(float, float, float) #1
+
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
Index: test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
===================================================================
--- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -59,7 +59,7 @@
 ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f32:
 ; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|
 ; SIVI-DAG: v_mad_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
-; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], 2.0, |[[X]]|, v{{[0-9]+}}
+; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm