Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -505,17 +505,15 @@
     if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
       return;
 
-    // Don't fold subregister extracts into tied operands, only if it is a full
-    // copy since a subregister use tied to a full register def doesn't really
-    // make sense. e.g. don't fold:
-    //
-    // %1 = COPY %0:sub1
-    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
-    //
-    //  into
-    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
-    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
-      return;
+    // Only fold a subregister extract into a tied operand when the use is a
+    // V_MAC_F32/V_MAC_F16/V_FMAC_F32 (e64) instruction; bail out otherwise.
+    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister) {
+      unsigned UseOpc = UseOp.getParent()->getOpcode();
+      if (UseOpc != AMDGPU::V_MAC_F32_e64 &&
+          UseOpc != AMDGPU::V_MAC_F16_e64 &&
+          UseOpc != AMDGPU::V_FMAC_F32_e64)
+        return;
+    }
   }
 
   // Special case for REG_SEQUENCE: We can't fold literals into
Index: test/CodeGen/AMDGPU/fmac-fma-sgpr-copy.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fmac-fma-sgpr-copy.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s
+
+; CHECK: v_fma_f32 v0, v1, v0, s0
+define amdgpu_cs float @test1(<4 x i32> inreg %a, float %b, float %y) { ; fma whose addend is element 0 of an s.buffer.load result
+entry:
+  %buf.load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %a, i32 0, i32 0) ; buffer load using %a; both i32 operands are 0
+  %vec1 = bitcast <4 x i32> %buf.load to <4 x float> ; reinterpret the four loaded words as floats
+  %.i095 = extractelement <4 x float> %vec1, i32 0 ; element 0 of the loaded vector
+  %.i098 = fsub nnan arcp float %b, %.i095 ; %b minus the loaded element, with nnan/arcp fast-math flags
+  %fma1 = call float @llvm.fma.f32(float %y, float %.i098, float %.i095) #3 ; %y * (%b - elt0) + elt0: the loaded element is the addend
+  ret float %fma1
+}
+
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) #2
+declare float @llvm.fma.f32(float, float, float) #1
+
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
Index: test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
===================================================================
--- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -59,7 +59,7 @@
 ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f32:
 ; GCN-DAG:   v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|
 ; SIVI-DAG:  v_mad_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
-; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], 2.0, |[[X]]|, v{{[0-9]+}}
+; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
 ; GCN-DAG:   buffer_store_dword [[MUL2]]
 ; GCN-DAG:   buffer_store_dword [[MAD]]
 ; GCN:       s_endpgm