diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4851,37 +4851,39 @@
   if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
     return false;
 
-  MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
-  MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+  Register Op1 = MI.getOperand(1).getReg();
+  Register Op2 = MI.getOperand(2).getReg();
+  DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+  DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
   unsigned PreferredFusedOpcode =
       HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
 
   // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
   // prefer to fold the multiply with fewer uses.
-  if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
-      isContractableFMul(*RHS, AllowFusionGlobally)) {
-    if (hasMoreUses(*LHS, *RHS, MRI))
+  if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+      isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+    if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
       std::swap(LHS, RHS);
   }
 
   // fold (fadd (fmul x, y), z) -> (fma x, y, z)
-  if (isContractableFMul(*LHS, AllowFusionGlobally) &&
-      (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()))) {
+  if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+      (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
-                   {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(),
-                    RHS->getOperand(0).getReg()});
+                   {LHS.MI->getOperand(1).getReg(),
+                    LHS.MI->getOperand(2).getReg(), RHS.Reg});
     };
     return true;
   }
 
   // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
-  if (isContractableFMul(*RHS, AllowFusionGlobally) &&
-      (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()))) {
+  if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+      (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
-                   {RHS->getOperand(1).getReg(), RHS->getOperand(2).getReg(),
-                    LHS->getOperand(0).getReg()});
+                   {RHS.MI->getOperand(1).getReg(),
+                    RHS.MI->getOperand(2).getReg(), LHS.Reg});
     };
     return true;
   }
@@ -4898,8 +4900,10 @@
     return false;
 
   const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
-  MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
-  MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+  Register Op1 = MI.getOperand(1).getReg();
+  Register Op2 = MI.getOperand(2).getReg();
+  DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+  DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
   LLT DstType = MRI.getType(MI.getOperand(0).getReg());
 
   unsigned PreferredFusedOpcode =
@@ -4907,42 +4911,38 @@
 
   // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
   // prefer to fold the multiply with fewer uses.
-  if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
-      isContractableFMul(*RHS, AllowFusionGlobally)) {
-    if (hasMoreUses(*LHS, *RHS, MRI))
+  if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+      isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+    if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
       std::swap(LHS, RHS);
   }
 
   // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
   MachineInstr *FpExtSrc;
-  if (mi_match(LHS->getOperand(0).getReg(), MRI,
-               m_GFPExt(m_MInstr(FpExtSrc))) &&
+  if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
       isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
       TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
                           MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg());
       auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg());
-      B.buildInstr(
-          PreferredFusedOpcode, {MI.getOperand(0).getReg()},
-          {FpExtX.getReg(0), FpExtY.getReg(0), RHS->getOperand(0).getReg()});
+      B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+                   {FpExtX.getReg(0), FpExtY.getReg(0), RHS.Reg});
     };
     return true;
   }
 
   // fold (fadd z, (fpext (fmul x, y))) -> (fma (fpext x), (fpext y), z)
   // Note: Commutes FADD operands.
-  if (mi_match(RHS->getOperand(0).getReg(), MRI,
-               m_GFPExt(m_MInstr(FpExtSrc))) &&
+  if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
      isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
      TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
                          MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
       auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg());
       auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg());
-      B.buildInstr(
-          PreferredFusedOpcode, {MI.getOperand(0).getReg()},
-          {FpExtX.getReg(0), FpExtY.getReg(0), LHS->getOperand(0).getReg()});
+      B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+                   {FpExtX.getReg(0), FpExtY.getReg(0), LHS.Reg});
     };
     return true;
   }
@@ -4958,8 +4958,10 @@
   if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive, true))
     return false;
 
-  MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
-  MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+  Register Op1 = MI.getOperand(1).getReg();
+  Register Op2 = MI.getOperand(2).getReg();
+  DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+  DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
 
   unsigned PreferredFusedOpcode =
@@ -4967,31 +4969,31 @@
 
   // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
   // prefer to fold the multiply with fewer uses.
-  if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
-      isContractableFMul(*RHS, AllowFusionGlobally)) {
-    if (hasMoreUses(*LHS, *RHS, MRI))
+  if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+      isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+    if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
       std::swap(LHS, RHS);
   }
 
   MachineInstr *FMA = nullptr;
   Register Z;
   // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
-  if (LHS->getOpcode() == PreferredFusedOpcode &&
-      (MRI.getVRegDef(LHS->getOperand(3).getReg())->getOpcode() ==
-       TargetOpcode::G_FMUL) &&
-      MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()) &&
-      MRI.hasOneNonDBGUse(LHS->getOperand(3).getReg())) {
-    FMA = LHS;
-    Z = RHS->getOperand(0).getReg();
+  if (LHS.MI->getOpcode() == PreferredFusedOpcode &&
+      (MRI.getVRegDef(LHS.MI->getOperand(3).getReg())->getOpcode() ==
+       TargetOpcode::G_FMUL) &&
+      MRI.hasOneNonDBGUse(LHS.MI->getOperand(0).getReg()) &&
+      MRI.hasOneNonDBGUse(LHS.MI->getOperand(3).getReg())) {
+    FMA = LHS.MI;
+    Z = RHS.Reg;
   }
   // fold (fadd z, (fma x, y, (fmul u, v))) -> (fma x, y, (fma u, v, z))
-  else if (RHS->getOpcode() == PreferredFusedOpcode &&
-           (MRI.getVRegDef(RHS->getOperand(3).getReg())->getOpcode() ==
-            TargetOpcode::G_FMUL) &&
-           MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()) &&
-           MRI.hasOneNonDBGUse(RHS->getOperand(3).getReg())) {
-    Z = LHS->getOperand(0).getReg();
-    FMA = RHS;
+  else if (RHS.MI->getOpcode() == PreferredFusedOpcode &&
+           (MRI.getVRegDef(RHS.MI->getOperand(3).getReg())->getOpcode() ==
+            TargetOpcode::G_FMUL) &&
+           MRI.hasOneNonDBGUse(RHS.MI->getOperand(0).getReg()) &&
+           MRI.hasOneNonDBGUse(RHS.MI->getOperand(3).getReg())) {
+    Z = LHS.Reg;
+    FMA = RHS.MI;
   }
 
   if (FMA) {
@@ -5026,17 +5028,19 @@
 
   const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
   LLT DstType = MRI.getType(MI.getOperand(0).getReg());
-  MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
-  MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+  Register Op1 = MI.getOperand(1).getReg();
+  Register Op2 = MI.getOperand(2).getReg();
+  DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+  DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
 
   unsigned PreferredFusedOpcode =
       HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
 
   // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
   // prefer to fold the multiply with fewer uses.
-  if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
-      isContractableFMul(*RHS, AllowFusionGlobally)) {
-    if (hasMoreUses(*LHS, *RHS, MRI))
+  if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+      isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+    if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
       std::swap(LHS, RHS);
   }
 
@@ -5055,16 +5059,17 @@
   MachineInstr *FMulMI, *FMAMI;
   // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
   //   -> (fma x, y, (fma (fpext u), (fpext v), z))
-  if (LHS->getOpcode() == PreferredFusedOpcode &&
-      mi_match(LHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) &&
+  if (LHS.MI->getOpcode() == PreferredFusedOpcode &&
+      mi_match(LHS.MI->getOperand(3).getReg(), MRI,
+               m_GFPExt(m_MInstr(FMulMI))) &&
       isContractableFMul(*FMulMI, AllowFusionGlobally) &&
       TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
                           MRI.getType(FMulMI->getOperand(0).getReg()))) {
     MatchInfo = [=](MachineIRBuilder &B) {
       buildMatchInfo(FMulMI->getOperand(1).getReg(),
-                     FMulMI->getOperand(2).getReg(),
-                     RHS->getOperand(0).getReg(), LHS->getOperand(1).getReg(),
-                     LHS->getOperand(2).getReg(), B);
+                     FMulMI->getOperand(2).getReg(), RHS.Reg,
+                     LHS.MI->getOperand(1).getReg(),
+                     LHS.MI->getOperand(2).getReg(), B);
     };
     return true;
   }
@@ -5074,7 +5079,7 @@
   // FIXME: This turns two single-precision and one double-precision
   // operation into two double-precision operations, which might not be
   // interesting for all targets, especially GPUs.
-  if (mi_match(LHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) &&
+  if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) &&
       FMAMI->getOpcode() == PreferredFusedOpcode) {
     MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg());
     if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
@@ -5086,8 +5091,7 @@
         X = B.buildFPExt(DstType, X).getReg(0);
         Y = B.buildFPExt(DstType, Y).getReg(0);
         buildMatchInfo(FMulMI->getOperand(1).getReg(),
-                       FMulMI->getOperand(2).getReg(),
-                       RHS->getOperand(0).getReg(), X, Y, B);
+                       FMulMI->getOperand(2).getReg(), RHS.Reg, X, Y, B);
       };
 
       return true;
@@ -5096,16 +5100,17 @@
 
   // fold (fadd z, (fma x, y, (fpext (fmul u, v)))
   //   -> (fma x, y, (fma (fpext u), (fpext v), z))
-  if (RHS->getOpcode() == PreferredFusedOpcode &&
-      mi_match(RHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) &&
+  if (RHS.MI->getOpcode() == PreferredFusedOpcode &&
+      mi_match(RHS.MI->getOperand(3).getReg(), MRI,
+               m_GFPExt(m_MInstr(FMulMI))) &&
      isContractableFMul(*FMulMI, AllowFusionGlobally) &&
      TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
                          MRI.getType(FMulMI->getOperand(0).getReg()))) {
     MatchInfo = [=](MachineIRBuilder &B) {
       buildMatchInfo(FMulMI->getOperand(1).getReg(),
-                     FMulMI->getOperand(2).getReg(),
-                     LHS->getOperand(0).getReg(), RHS->getOperand(1).getReg(),
-                     RHS->getOperand(2).getReg(), B);
+                     FMulMI->getOperand(2).getReg(), LHS.Reg,
+                     RHS.MI->getOperand(1).getReg(),
+                     RHS.MI->getOperand(2).getReg(), B);
     };
     return true;
   }
@@ -5115,7 +5120,7 @@
   // FIXME: This turns two single-precision and one double-precision
   // operation into two double-precision operations, which might not be
   // interesting for all targets, especially GPUs.
-  if (mi_match(RHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) &&
+  if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) &&
       FMAMI->getOpcode() == PreferredFusedOpcode) {
     MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg());
     if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
@@ -5127,8 +5132,7 @@
         X = B.buildFPExt(DstType, X).getReg(0);
         Y = B.buildFPExt(DstType, Y).getReg(0);
         buildMatchInfo(FMulMI->getOperand(1).getReg(),
-                       FMulMI->getOperand(2).getReg(),
-                       LHS->getOperand(0).getReg(), X, Y, B);
+                       FMulMI->getOperand(2).getReg(), LHS.Reg, X, Y, B);
       };
       return true;
     }
@@ -5145,16 +5149,18 @@
   if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
     return false;
 
-  MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
-  MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+  Register Op1 = MI.getOperand(1).getReg();
+  Register Op2 = MI.getOperand(2).getReg();
+  DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+  DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
 
   // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
   // prefer to fold the multiply with fewer uses.
   int FirstMulHasFewerUses = true;
-  if (isContractableFMul(*LHS, AllowFusionGlobally) &&
-      isContractableFMul(*RHS, AllowFusionGlobally) &&
-      hasMoreUses(*LHS, *RHS, MRI))
+  if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+      isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+      hasMoreUses(*LHS.MI, *RHS.MI, MRI))
     FirstMulHasFewerUses = false;
 
   unsigned PreferredFusedOpcode =
@@ -5162,24 +5168,24 @@
 
   // fold (fsub (fmul x, y), z) -> (fma x, y, -z)
   if (FirstMulHasFewerUses &&
-      (isContractableFMul(*LHS, AllowFusionGlobally) &&
-       (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg())))) {
+      (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+       (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg)))) {
     MatchInfo = [=, &MI](MachineIRBuilder &B) {
-      Register NegZ = B.buildFNeg(DstTy, RHS->getOperand(0).getReg()).getReg(0);
-      B.buildInstr(
-          PreferredFusedOpcode, {MI.getOperand(0).getReg()},
-          {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), NegZ});
+      Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0);
+      B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+                   {LHS.MI->getOperand(1).getReg(),
+                    LHS.MI->getOperand(2).getReg(), NegZ});
    };
    return true;
  }
  // fold (fsub x, (fmul y, z)) -> (fma -y, z, x)
-  else if ((isContractableFMul(*RHS, AllowFusionGlobally) &&
-            (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg())))) {
+  else if ((isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+            (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg)))) {
    MatchInfo = [=, &MI](MachineIRBuilder &B) {
-      Register NegY = B.buildFNeg(DstTy, RHS->getOperand(1).getReg()).getReg(0);
-      B.buildInstr(
-          PreferredFusedOpcode, {MI.getOperand(0).getReg()},
-          {NegY, RHS->getOperand(2).getReg(), LHS->getOperand(0).getReg()});
+      Register NegY =
+          B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0);
+      B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+                   {NegY, RHS.MI->getOperand(2).getReg(), LHS.Reg});
     };
     return true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
@@ -218,9 +218,8 @@
     ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX9-CONTRACT-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
-    ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+    ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+    ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
     ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
     ; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -240,9 +239,8 @@
     ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX9-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
-    ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+    ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+    ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
     ; GFX10-LABEL: name: test_add_mul_multiple_defs_z
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -262,9 +260,8 @@
     ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-CONTRACT-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
-    ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+    ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+    ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
     ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
     ; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -284,9 +281,8 @@
     ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
-    ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+    ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+    ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
     ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
    %0:_(s32) = COPY $vgpr0
    %1:_(s32) = COPY $vgpr1
@@ -325,9 +321,8 @@
     ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX9-CONTRACT-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
-    ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+    ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+    ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
     ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
     ; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -347,9 +342,8 @@
     ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX9-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
-    ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+    ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+    ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
     ; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -369,9 +363,8 @@
     ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-CONTRACT-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
-    ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+    ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+    ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
     ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
     ; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -391,9 +384,8 @@
     ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
-    ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+    ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+    ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
     ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
    %0:_(s32) = COPY $vgpr0
    %1:_(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -149,11 +149,10 @@
 ; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DENORM-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-DENORM-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_mac_f32_e32 v0, v4, v5
+; GFX9-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
@@ -188,11 +187,10 @@
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v5, v1
-; GFX10-DENORM-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GFX10-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT:    v_mac_f32_e32 v0, v4, v5
+; GFX10-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
@@ -233,11 +231,10 @@
 ; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DENORM-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-DENORM-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_mac_f32_e32 v0, v4, v5
+; GFX9-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
@@ -272,11 +269,10 @@
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v5, v1
-; GFX10-DENORM-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GFX10-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT:    v_mac_f32_e32 v0, v4, v5
+; GFX10-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -14,9 +14,8 @@
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el0
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
    %0:_(s32) = COPY $vgpr0
    %1:_(s32) = COPY $vgpr1
@@ -42,9 +41,8 @@
    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
    ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el0
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
    %0:_(s32) = COPY $vgpr0
    %1:_(s32) = COPY $vgpr1
@@ -72,11 +70,10 @@
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
     ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el0
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
    %0:_(s32) = COPY $sgpr0
    %1:_(s16) = G_TRUNC %0(s32)
@@ -107,11 +104,10 @@
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
     ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el0
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
    %0:_(s32) = COPY $sgpr0
    %1:_(s16) = G_TRUNC %0(s32)
@@ -139,9 +135,8 @@
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el0
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1
     ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
    %0:_(s32) = COPY $vgpr0
@@ -170,9 +165,8 @@
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el0
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1
     ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
    %0:_(s32) = COPY $vgpr0
@@ -202,15 +196,14 @@
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
     ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el0
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
     ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
    %0:_(s32) = COPY $vgpr0
@@ -245,8 +238,7 @@
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5
@@ -255,7 +247,7 @@
     ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
     ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT2]], [[FPEXT3]], %el0
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT2]], [[FPEXT3]], %el1
     ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], [[FMA]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
    %0:_(s32) = COPY $vgpr0
@@ -289,8 +281,7 @@
     ; GFX10-LABEL: name: test_f16_f32_add_fma_ext_mul_rhs
     ; GFX10: %ptr:_(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
@@ -299,7 +290,7 @@
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
     ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el0
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
     ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
    %ptr:_(p1) = COPY $vgpr0_vgpr1
@@ -330,8 +321,7 @@
     ; GFX10-LABEL: name: test_f16_f32_add_ext_fma_mul_rhs
     ; GFX10: %ptr:_(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
@@ -344,7 +334,7 @@
     ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
     ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT2]], [[FPEXT3]], %el0
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT2]], [[FPEXT3]], %el1
     ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], [[FMA]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
    %ptr:_(p1) = COPY $vgpr0_vgpr1
@@ -380,9 +370,8 @@
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
-    ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %el0
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %el1
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FNEG]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
    %0:_(s32) = COPY $vgpr0
@@ -409,10 +398,9 @@
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
-    ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
     ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
-    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[COPY1]], %el0
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[COPY1]], %el1
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
    %0:_(s32) = COPY $vgpr0
    %1:_(s32) = COPY $vgpr1
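
Context for the change above (not part of the patch itself): the combines previously recovered the addend register as operand 0 of its defining instruction (e.g. LHS->getOperand(0).getReg()). That is only correct when the definition has a single def; when the addend comes from a multi-def instruction such as G_UNMERGE_VALUES, operand 0 can name a different element than the one the G_FADD actually reads, which is exactly what the updated tests exercise (%el1 / [[UV1]] instead of %el0 / [[TRUNC]]). The patch therefore carries the original use register alongside the defining MachineInstr via DefinitionAndSourceRegister (fields MI and Reg). The following toy program is an illustrative sketch only, with made-up stand-in types rather than LLVM classes, showing the difference between "first def of the definition" and "the register this use reads"; it compiles with any C++11 compiler.

// Toy model of the bug class fixed above. Types are hypothetical stand-ins.
#include <cassert>
#include <string>
#include <vector>

struct Instr {
  std::string Opcode;
  std::vector<int> Defs; // virtual registers defined by this instruction
  std::vector<int> Uses; // virtual registers read by this instruction
};

// Mirrors the idea of DefinitionAndSourceRegister: keep the defining
// instruction together with the register the user actually reads.
struct DefAndSrcReg {
  const Instr *MI;
  int Reg;
};

int main() {
  // %10 (el0), %11 (el1) = G_UNMERGE_VALUES %20 (vec)
  Instr Unmerge{"G_UNMERGE_VALUES", /*Defs=*/{10, 11}, /*Uses=*/{20}};
  // %30 = G_FADD %40, %11   -- the addend is %11, the *second* def.
  int AddendReg = 11;

  int FirstDefReg = Unmerge.Defs[0];        // old pattern: def's operand 0
  DefAndSrcReg Addend{&Unmerge, AddendReg}; // new pattern: def + use register

  assert(FirstDefReg != AddendReg); // old pattern picks the wrong element
  assert(Addend.Reg == AddendReg);  // new pattern preserves the real addend
  return 0;
}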