diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1839,6 +1839,9 @@ } if (isTypeLegal(MVT::i64)) { + // The algorithm here is based on ideas from "Software Integer Division", + // Tom Rodeheffer, August 2008. + MachineFunction &MF = DAG.getMachineFunction(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -1873,37 +1876,35 @@ SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); + // First round of UNR (Unsigned integer Newton-Raphson). SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, Zero); - SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, - One); - + SDValue Mulhi1_Hi = + DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One); SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo, Mulhi1_Lo, Zero1); SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi, Mulhi1_Hi, Add1_Lo.getValue(1)); - SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi); SDValue Add1 = DAG.getBitcast(VT, DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); + // Second round of UNR. 
SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1); SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2); SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, Zero); - SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, - One); - + SDValue Mulhi2_Hi = + DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One); SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo, Mulhi2_Lo, Zero1); - SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc, - Mulhi2_Hi, Add1_Lo.getValue(1)); - SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC, - Zero, Add2_Lo.getValue(1)); + SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi, + Mulhi2_Hi, Add2_Lo.getValue(1)); SDValue Add2 = DAG.getBitcast(VT, DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi})); + SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2); SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3051,7 +3051,6 @@ auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); - auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); auto MulLo2 = B.buildMul(S64, NegDenom, Add1); @@ -3062,9 +3061,7 @@ auto Zero32 = B.buildConstant(S32, 0); auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); - auto Add2_HiC = - B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); - auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); + auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); auto UnmergeNumer = B.buildUnmerge(S32, Numer); diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir @@ -415,72 +415,70 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL 
[[UADDE4]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: 
[[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; 
GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -494,8 +492,8 @@ ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 1 ; GFX6-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]] - ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]] - ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]] + ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]] ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]] @@ -504,8 +502,8 @@ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]] - ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]] - ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]] + ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -587,72 +585,70 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; 
GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX8-NEXT: 
[[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = 
G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD 
[[UMULH13]], [[ADD14]] + ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -666,8 +662,8 @@ ; GFX8-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX8-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]] - ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]] - ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX8-NEXT: 
[[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]] + ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]] ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]] @@ -676,8 +672,8 @@ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]] - ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]] - ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]] + ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -759,72 +755,70 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH 
[[USUBO]], [[UADDO14]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), 
[[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX9-NEXT: 
[[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; 
GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -838,8 +832,8 @@ ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]] - ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]] - ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]] + ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]] ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]] 
@@ -848,8 +842,8 @@ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]] - ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]] - ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]] + ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -945,72 +939,70 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], 
[[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: 
[[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]] ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], 
[[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX6-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: 
[[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]] ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]] ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -1024,8 +1016,8 @@ ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX6-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV26]] - ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV27]], [[UADDO39]] - ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV27]], [[UADDO39]] + ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV25]] ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV24]] @@ -1034,8 +1026,8 @@ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX6-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV28]] - ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV29]], [[UADDO41]] - ; 
GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV29]], [[UADDO41]] + ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -1052,13 +1044,13 @@ ; GFX6-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64) ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]] - ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]] - ; GFX6-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE14]](s32) + ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]] + ; GFX6-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE12]](s32) ; GFX6-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64) ; GFX6-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UV38]], [[UV40]] - ; GFX6-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]] - ; GFX6-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE16]](s32) + ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]] + ; GFX6-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE14]](s32) ; GFX6-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV6]], 
[[ASHR2]] ; GFX6-NEXT: [[XOR5:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]] ; GFX6-NEXT: [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) @@ -1086,96 +1078,94 @@ ; GFX6-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[FPTOUI2]] ; GFX6-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]] ; GFX6-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]] - ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX6-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX6-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX6-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX6-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH16]] ; GFX6-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX6-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX6-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX6-NEXT: 
[[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH18]] ; GFX6-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD21]] + ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD20]] ; GFX6-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) - ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX6-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO54]] - ; GFX6-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO57]] - ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX6-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO57]] ; GFX6-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO56]] ; GFX6-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[UADDO56]] - ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE18]] + ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE16]] ; GFX6-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO56]] - ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[MUL24]] - ; GFX6-NEXT: 
[[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD27]] + ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[MUL24]] + ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD25]] ; GFX6-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[MUL24]] ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX6-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX6-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH21]] ; GFX6-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[ADD27]] - ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[MUL24]] - ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD27]] + ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[ADD25]] + ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[MUL24]] + ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD25]] ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX6-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX6-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH23]] ; GFX6-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD28]] + ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD26]] 
; GFX6-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) - ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[ADD27]] - ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[ADD25]] + ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX6-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[UADDO66]] - ; GFX6-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO57]] - ; GFX6-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[C6]], [[UADDO69]] + ; GFX6-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[ADD29]], [[UADDO69]] ; GFX6-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX6-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX6-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDO68]] - ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE22]] + ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE18]] ; GFX6-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDO68]] ; GFX6-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX6-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1) ; GFX6-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH25]] ; GFX6-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1) - ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE22]] + ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL 
[[UV51]], [[UADDE18]] ; GFX6-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDO68]] - ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE22]] + ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE18]] ; GFX6-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX6-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1) ; GFX6-NEXT: [[UADDO76:%[0-9]+]]:_(s32), [[UADDO77:%[0-9]+]]:_(s1) = G_UADDO [[UADDO74]], [[UMULH27]] ; GFX6-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO77]](s1) - ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX6-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD32]] + ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX6-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD30]] ; GFX6-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO79]](s1) - ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE22]] - ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] - ; GFX6-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD35]](s32) + ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE18]] + ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] + ; GFX6-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD33]](s32) ; GFX6-NEXT: [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) ; GFX6-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[UADDO78]] ; GFX6-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV53]], [[UADDO78]] - ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD35]] + ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD33]] ; GFX6-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = 
G_UMULH [[UV52]], [[UADDO78]] - ; GFX6-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX6-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX6-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV48]], [[MUL33]] - ; GFX6-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD37]], [[USUBO11]] - ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD37]] + ; GFX6-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD35]], [[USUBO11]] + ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD35]] ; GFX6-NEXT: [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) ; GFX6-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE12]](s32), [[UV55]] ; GFX6-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1) @@ -1188,8 +1178,8 @@ ; GFX6-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[USUBE14]], [[C6]], [[USUBO13]] ; GFX6-NEXT: [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO80:%[0-9]+]]:_(s32), [[UADDO81:%[0-9]+]]:_(s1) = G_UADDO [[UADDO78]], [[UV56]] - ; GFX6-NEXT: [[UADDE24:%[0-9]+]]:_(s32), [[UADDE25:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV57]], [[UADDO81]] - ; GFX6-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE24]](s32) + ; GFX6-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV57]], [[UADDO81]] + ; GFX6-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE20]](s32) ; GFX6-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV55]] ; GFX6-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1) ; GFX6-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO12]](s32), [[UV54]] @@ -1198,8 +1188,8 @@ ; 
GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]] ; GFX6-NEXT: [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO82:%[0-9]+]]:_(s32), [[UADDO83:%[0-9]+]]:_(s1) = G_UADDO [[UADDO80]], [[UV58]] - ; GFX6-NEXT: [[UADDE26:%[0-9]+]]:_(s32), [[UADDE27:%[0-9]+]]:_(s1) = G_UADDE [[UADDE24]], [[UV59]], [[UADDO83]] - ; GFX6-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE26]](s32) + ; GFX6-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[UV59]], [[UADDO83]] + ; GFX6-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE22]](s32) ; GFX6-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C6]] ; GFX6-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV10]], [[MV9]] ; GFX6-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C6]] @@ -1284,72 +1274,70 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; 
GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) 
= G_UMULH [[UADDE4]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]] ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: 
[[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX8-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), 
[[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]] ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]] ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -1363,8 +1351,8 @@ ; GFX8-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX8-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV26]] - ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV27]], [[UADDO39]] - ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV27]], [[UADDO39]] + ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV25]] ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV24]] @@ -1373,8 +1361,8 @@ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX8-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV28]] - ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV29]], [[UADDO41]] - ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) 
= G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV29]], [[UADDO41]] + ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -1391,13 +1379,13 @@ ; GFX8-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64) ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]] - ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]] - ; GFX8-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE14]](s32) + ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]] + ; GFX8-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE12]](s32) ; GFX8-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64) ; GFX8-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UV38]], [[UV40]] - ; GFX8-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]] - ; GFX8-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE16]](s32) + ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]] + ; GFX8-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE14]](s32) ; GFX8-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]] ; GFX8-NEXT: 
[[XOR5:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]] ; GFX8-NEXT: [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) @@ -1425,96 +1413,94 @@ ; GFX8-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[FPTOUI2]] ; GFX8-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]] ; GFX8-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]] - ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX8-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX8-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX8-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX8-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH16]] ; GFX8-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX8-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX8-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX8-NEXT: [[UADDO52:%[0-9]+]]:_(s32), 
[[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH18]] ; GFX8-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD21]] + ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD20]] ; GFX8-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) - ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX8-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO54]] - ; GFX8-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO57]] - ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX8-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO57]] ; GFX8-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO56]] ; GFX8-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[UADDO56]] - ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE18]] + ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE16]] ; GFX8-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO56]] - ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[MUL24]] - ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL 
[[UADDO56]], [[ADD27]] + ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[MUL24]] + ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD25]] ; GFX8-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[MUL24]] ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX8-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX8-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH21]] ; GFX8-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[ADD27]] - ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[MUL24]] - ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD27]] + ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[ADD25]] + ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[MUL24]] + ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD25]] ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX8-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX8-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH23]] ; GFX8-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD28]] + ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD26]] ; GFX8-NEXT: 
[[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) - ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[ADD27]] - ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[ADD25]] + ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX8-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[UADDO66]] - ; GFX8-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO57]] - ; GFX8-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[C6]], [[UADDO69]] + ; GFX8-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[ADD29]], [[UADDO69]] ; GFX8-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX8-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX8-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDO68]] - ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE22]] + ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE18]] ; GFX8-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDO68]] ; GFX8-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX8-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1) ; GFX8-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH25]] ; GFX8-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1) - ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE22]] + ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], 
[[UADDE18]] ; GFX8-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDO68]] - ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE22]] + ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE18]] ; GFX8-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX8-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1) ; GFX8-NEXT: [[UADDO76:%[0-9]+]]:_(s32), [[UADDO77:%[0-9]+]]:_(s1) = G_UADDO [[UADDO74]], [[UMULH27]] ; GFX8-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO77]](s1) - ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX8-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD32]] + ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX8-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD30]] ; GFX8-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO79]](s1) - ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE22]] - ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] - ; GFX8-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD35]](s32) + ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE18]] + ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] + ; GFX8-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD33]](s32) ; GFX8-NEXT: [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) ; GFX8-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[UADDO78]] ; GFX8-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV53]], [[UADDO78]] - ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD35]] + ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD33]] ; GFX8-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH 
[[UV52]], [[UADDO78]] - ; GFX8-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX8-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX8-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV48]], [[MUL33]] - ; GFX8-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD37]], [[USUBO11]] - ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD37]] + ; GFX8-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD35]], [[USUBO11]] + ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD35]] ; GFX8-NEXT: [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) ; GFX8-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE12]](s32), [[UV55]] ; GFX8-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1) @@ -1527,8 +1513,8 @@ ; GFX8-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[USUBE14]], [[C6]], [[USUBO13]] ; GFX8-NEXT: [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO80:%[0-9]+]]:_(s32), [[UADDO81:%[0-9]+]]:_(s1) = G_UADDO [[UADDO78]], [[UV56]] - ; GFX8-NEXT: [[UADDE24:%[0-9]+]]:_(s32), [[UADDE25:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV57]], [[UADDO81]] - ; GFX8-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE24]](s32) + ; GFX8-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV57]], [[UADDO81]] + ; GFX8-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE20]](s32) ; GFX8-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV55]] ; GFX8-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1) ; GFX8-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO12]](s32), [[UV54]] @@ -1537,8 +1523,8 @@ ; 
GFX8-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]] ; GFX8-NEXT: [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO82:%[0-9]+]]:_(s32), [[UADDO83:%[0-9]+]]:_(s1) = G_UADDO [[UADDO80]], [[UV58]] - ; GFX8-NEXT: [[UADDE26:%[0-9]+]]:_(s32), [[UADDE27:%[0-9]+]]:_(s1) = G_UADDE [[UADDE24]], [[UV59]], [[UADDO83]] - ; GFX8-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE26]](s32) + ; GFX8-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[UV59]], [[UADDO83]] + ; GFX8-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE22]](s32) ; GFX8-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C6]] ; GFX8-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV10]], [[MV9]] ; GFX8-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C6]] @@ -1623,72 +1609,70 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; 
GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) 
= G_UMULH [[UADDE4]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]] ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: 
[[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), 
[[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]] ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]] ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -1702,8 +1686,8 @@ ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX9-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV26]] - ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV27]], [[UADDO39]] - ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV27]], [[UADDO39]] + ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV25]] ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV24]] @@ -1712,8 +1696,8 @@ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV28]] - ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV29]], [[UADDO41]] - ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) 
= G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV29]], [[UADDO41]] + ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -1730,13 +1714,13 @@ ; GFX9-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64) ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]] - ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]] - ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE14]](s32) + ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]] + ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE12]](s32) ; GFX9-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64) ; GFX9-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UV38]], [[UV40]] - ; GFX9-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]] - ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE16]](s32) + ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]] + ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE14]](s32) ; GFX9-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]] ; GFX9-NEXT: 
[[XOR5:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]] ; GFX9-NEXT: [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) @@ -1764,96 +1748,94 @@ ; GFX9-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[FPTOUI2]] ; GFX9-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]] ; GFX9-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]] - ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX9-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX9-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX9-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX9-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH16]] ; GFX9-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX9-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX9-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX9-NEXT: [[UADDO52:%[0-9]+]]:_(s32), 
[[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH18]] ; GFX9-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD21]] + ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD20]] ; GFX9-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) - ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX9-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO54]] - ; GFX9-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO57]] - ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX9-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO57]] ; GFX9-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO56]] ; GFX9-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[UADDO56]] - ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE18]] + ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE16]] ; GFX9-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO56]] - ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[MUL24]] - ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL 
[[UADDO56]], [[ADD27]] + ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[MUL24]] + ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD25]] ; GFX9-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[MUL24]] ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX9-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX9-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH21]] ; GFX9-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[ADD27]] - ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[MUL24]] - ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD27]] + ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[ADD25]] + ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[MUL24]] + ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD25]] ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX9-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX9-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH23]] ; GFX9-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD28]] + ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD26]] ; GFX9-NEXT: 
[[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) - ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[ADD27]] - ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[ADD25]] + ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX9-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[UADDO66]] - ; GFX9-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO57]] - ; GFX9-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[C6]], [[UADDO69]] + ; GFX9-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[ADD29]], [[UADDO69]] ; GFX9-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX9-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX9-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDO68]] - ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE22]] + ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE18]] ; GFX9-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDO68]] ; GFX9-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX9-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1) ; GFX9-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH25]] ; GFX9-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1) - ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE22]] + ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], 
[[UADDE18]] ; GFX9-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDO68]] - ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE22]] + ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE18]] ; GFX9-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX9-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1) ; GFX9-NEXT: [[UADDO76:%[0-9]+]]:_(s32), [[UADDO77:%[0-9]+]]:_(s1) = G_UADDO [[UADDO74]], [[UMULH27]] ; GFX9-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO77]](s1) - ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX9-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD32]] + ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX9-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD30]] ; GFX9-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO79]](s1) - ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE22]] - ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] - ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD35]](s32) + ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE18]] + ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] + ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD33]](s32) ; GFX9-NEXT: [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) ; GFX9-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[UADDO78]] ; GFX9-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV53]], [[UADDO78]] - ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD35]] + ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD33]] ; GFX9-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH 
[[UV52]], [[UADDO78]] - ; GFX9-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX9-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX9-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV48]], [[MUL33]] - ; GFX9-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD37]], [[USUBO11]] - ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD37]] + ; GFX9-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD35]], [[USUBO11]] + ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD35]] ; GFX9-NEXT: [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64) ; GFX9-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE12]](s32), [[UV55]] ; GFX9-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1) @@ -1866,8 +1848,8 @@ ; GFX9-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[USUBE14]], [[C6]], [[USUBO13]] ; GFX9-NEXT: [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO80:%[0-9]+]]:_(s32), [[UADDO81:%[0-9]+]]:_(s1) = G_UADDO [[UADDO78]], [[UV56]] - ; GFX9-NEXT: [[UADDE24:%[0-9]+]]:_(s32), [[UADDE25:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV57]], [[UADDO81]] - ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE24]](s32) + ; GFX9-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV57]], [[UADDO81]] + ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE20]](s32) ; GFX9-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV55]] ; GFX9-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1) ; GFX9-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO12]](s32), [[UV54]] @@ -1876,8 +1858,8 @@ ; 
GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]] ; GFX9-NEXT: [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO82:%[0-9]+]]:_(s32), [[UADDO83:%[0-9]+]]:_(s1) = G_UADDO [[UADDO80]], [[UV58]] - ; GFX9-NEXT: [[UADDE26:%[0-9]+]]:_(s32), [[UADDE27:%[0-9]+]]:_(s1) = G_UADDE [[UADDE24]], [[UV59]], [[UADDO83]] - ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE26]](s32) + ; GFX9-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[UV59]], [[UADDO83]] + ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE22]](s32) ; GFX9-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C6]] ; GFX9-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV10]], [[MV9]] ; GFX9-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C6]] @@ -2608,72 +2590,70 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; 
GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) 
= G_UMULH [[UADDE4]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: 
[[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), 
[[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -2687,8 +2667,8 @@ ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX6-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]] - ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]] - ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]] + ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]] ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]] @@ -2697,8 +2677,8 @@ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]] - ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]] - ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) 
= G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]] + ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -2782,72 +2762,70 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = 
G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = 
G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD 
[[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), 
[[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -2861,8 +2839,8 @@ ; GFX8-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX8-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]] - ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]] - ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]] + ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]] ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]] @@ -2871,8 +2849,8 @@ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]] - ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]] - ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]] + ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX8-NEXT: 
[[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -2956,72 +2934,70 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], 
[[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX9-NEXT: 
[[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX9-NEXT: 
[[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32) + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32) ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -3035,8 +3011,8 @@ ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), 
[[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]] - ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]] - ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]] + ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]] ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]] @@ -3045,8 +3021,8 @@ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]] - ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]] - ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]] + ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]] ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir @@ -382,71 +382,69 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: 
[[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] 
; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], 
[[UADDO26]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], 
[[ADD14]] ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] @@ -549,71 +547,69 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], 
[[UADDE4]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], 
[[ZEXT8]] + ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL 
[[UV17]], [[UADDE8]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; 
GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] @@ -716,71 +712,69 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: 
[[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD 
[[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), 
[[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), 
[[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] @@ -897,71 +891,69 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO 
[[UADDO16]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; 
GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]] ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO 
[[UADDO34]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX6-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]] ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) 
= G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]] @@ -999,13 +991,13 @@ ; GFX6-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64) ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UV30]], [[UV32]] - ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]] - ; GFX6-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]] + ; GFX6-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX6-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64) ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]] - ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]] - ; GFX6-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]] + ; GFX6-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX6-NEXT: [[XOR3:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]] ; GFX6-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]] ; GFX6-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) @@ -1033,95 +1025,93 @@ ; GFX6-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[FPTOUI2]] ; GFX6-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[FPTOUI3]] ; GFX6-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[FPTOUI2]] - ; GFX6-NEXT: 
[[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX6-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX6-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX6-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) ; GFX6-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH16]] ; GFX6-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1) - ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX6-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX6-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX6-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH18]] ; GFX6-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD21]] + ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX6-NEXT: 
[[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD20]] ; GFX6-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) - ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX6-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO50]] - ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO53]] - ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO53]] ; GFX6-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDO52]] ; GFX6-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[UADDO52]] - ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE14]] + ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE12]] ; GFX6-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[UADDO52]] - ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[MUL24]] - ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD27]] + ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[MUL24]] + ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD25]] ; GFX6-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[MUL24]] ; GFX6-NEXT: 
[[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX6-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) ; GFX6-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH21]] ; GFX6-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1) - ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[ADD27]] - ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[MUL24]] - ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD27]] + ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[ADD25]] + ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[MUL24]] + ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD25]] ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX6-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX6-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH23]] ; GFX6-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD28]] + ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD26]] ; GFX6-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) - ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[ADD27]] - ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[ADD25]] 
+ ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX6-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[UADDO62]] - ; GFX6-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO53]] - ; GFX6-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[C6]], [[UADDO65]] + ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[ADD29]], [[UADDO65]] ; GFX6-NEXT: [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64) ; GFX6-NEXT: [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64) ; GFX6-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDO64]] - ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE18]] + ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE14]] ; GFX6-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDO64]] ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX6-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) ; GFX6-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH25]] ; GFX6-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1) - ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE18]] + ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE14]] ; GFX6-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDO64]] - ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE18]] + ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE14]] ; GFX6-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX6-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1) ; GFX6-NEXT: 
[[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH27]] ; GFX6-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1) - ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX6-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD32]] + ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX6-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD30]] ; GFX6-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1) - ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE18]] - ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] + ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE14]] + ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] ; GFX6-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX6-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[UADDO74]] ; GFX6-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV49]], [[UADDO74]] - ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD35]] + ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD33]] ; GFX6-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV48]], [[UADDO74]] - ; GFX6-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX6-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX6-NEXT: [[USUBO12:%[0-9]+]]:_(s32), [[USUBO13:%[0-9]+]]:_(s1) = G_USUBO [[UV44]], [[MUL33]] - ; GFX6-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD37]], [[USUBO13]] - ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD37]] + ; 
GFX6-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD35]], [[USUBO13]] + ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD35]] ; GFX6-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO12]](s32), [[USUBE16]](s32) ; GFX6-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX6-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV51]] @@ -1227,71 +1217,69 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = 
G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX8-NEXT: 
[[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]] ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: 
[[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX8-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]] ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), 
[[UV25]] @@ -1329,13 +1317,13 @@ ; GFX8-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64) ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UV30]], [[UV32]] - ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]] - ; GFX8-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]] + ; GFX8-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX8-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64) ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]] - ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]] - ; GFX8-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]] + ; GFX8-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX8-NEXT: [[XOR3:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]] ; GFX8-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]] ; GFX8-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) @@ -1363,95 +1351,93 @@ ; GFX8-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[FPTOUI2]] ; GFX8-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[FPTOUI3]] ; GFX8-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[FPTOUI2]] - ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], 
[[UMULH15]] + ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX8-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX8-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX8-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) ; GFX8-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH16]] ; GFX8-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1) - ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX8-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX8-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX8-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH18]] ; GFX8-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD21]] + ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD20]] ; GFX8-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT 
[[UADDO51]](s1) - ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX8-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO50]] - ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO53]] - ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO53]] ; GFX8-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDO52]] ; GFX8-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[UADDO52]] - ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE14]] + ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE12]] ; GFX8-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[UADDO52]] - ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[MUL24]] - ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD27]] + ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[MUL24]] + ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD25]] ; GFX8-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[MUL24]] ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX8-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT 
[[UADDO55]](s1) ; GFX8-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH21]] ; GFX8-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1) - ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[ADD27]] - ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[MUL24]] - ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD27]] + ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[ADD25]] + ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[MUL24]] + ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD25]] ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX8-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX8-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH23]] ; GFX8-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD28]] + ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD26]] ; GFX8-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) - ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[ADD27]] - ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[ADD25]] + ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX8-NEXT: [[UADDO64:%[0-9]+]]:_(s32), 
[[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[UADDO62]] - ; GFX8-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO53]] - ; GFX8-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[C6]], [[UADDO65]] + ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[ADD29]], [[UADDO65]] ; GFX8-NEXT: [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64) ; GFX8-NEXT: [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64) ; GFX8-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDO64]] - ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE18]] + ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE14]] ; GFX8-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDO64]] ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX8-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) ; GFX8-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH25]] ; GFX8-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1) - ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE18]] + ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE14]] ; GFX8-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDO64]] - ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE18]] + ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE14]] ; GFX8-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX8-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1) ; GFX8-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH27]] ; GFX8-NEXT: 
[[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1) - ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX8-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD32]] + ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX8-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD30]] ; GFX8-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1) - ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE18]] - ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] + ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE14]] + ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] ; GFX8-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX8-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[UADDO74]] ; GFX8-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV49]], [[UADDO74]] - ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD35]] + ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD33]] ; GFX8-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV48]], [[UADDO74]] - ; GFX8-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX8-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX8-NEXT: [[USUBO12:%[0-9]+]]:_(s32), [[USUBO13:%[0-9]+]]:_(s1) = G_USUBO [[UV44]], [[MUL33]] - ; GFX8-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD37]], [[USUBO13]] - ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD37]] + ; GFX8-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD35]], [[USUBO13]] + 
; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD35]] ; GFX8-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO12]](s32), [[USUBE16]](s32) ; GFX8-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX8-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV51]] @@ -1557,71 +1543,69 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; 
GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX9-NEXT: 
[[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]] ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = 
G_ZEXT [[UADDO37]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]] ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]] @@ -1659,13 +1643,13 @@ ; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) 
; GFX9-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64) ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UV30]], [[UV32]] - ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]] - ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32) + ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]] + ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32) ; GFX9-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64) ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]] - ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]] - ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32) + ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]] + ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32) ; GFX9-NEXT: [[XOR3:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]] ; GFX9-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]] ; GFX9-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) @@ -1693,95 +1677,93 @@ ; GFX9-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[FPTOUI2]] ; GFX9-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[FPTOUI3]] ; GFX9-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[FPTOUI2]] - ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD 
[[ADD18]], [[UMULH15]] ; GFX9-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX9-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX9-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) ; GFX9-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH16]] ; GFX9-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1) - ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX9-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX9-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX9-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH18]] ; GFX9-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD21]] + ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD20]] ; GFX9-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) - ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = 
G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX9-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO50]] - ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO53]] - ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO53]] ; GFX9-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDO52]] ; GFX9-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[UADDO52]] - ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE14]] + ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE12]] ; GFX9-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[UADDO52]] - ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[MUL24]] - ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD27]] + ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[MUL24]] + ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD25]] ; GFX9-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[MUL24]] ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX9-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) ; GFX9-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH21]] ; GFX9-NEXT: 
[[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1) - ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[ADD27]] - ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[MUL24]] - ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD27]] + ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[ADD25]] + ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[MUL24]] + ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD25]] ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX9-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX9-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH23]] ; GFX9-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD28]] + ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD26]] ; GFX9-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) - ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[ADD27]] - ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[ADD25]] + ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX9-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[UADDO62]] - ; GFX9-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], 
[[ADD31]], [[UADDO53]] - ; GFX9-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[C6]], [[UADDO65]] + ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[ADD29]], [[UADDO65]] ; GFX9-NEXT: [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64) ; GFX9-NEXT: [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64) ; GFX9-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDO64]] - ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE18]] + ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE14]] ; GFX9-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDO64]] ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX9-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) ; GFX9-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH25]] ; GFX9-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1) - ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE18]] + ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE14]] ; GFX9-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDO64]] - ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE18]] + ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE14]] ; GFX9-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX9-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1) ; GFX9-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH27]] ; GFX9-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1) - ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX9-NEXT: [[UADDO74:%[0-9]+]]:_(s32), 
[[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD32]] + ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX9-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD30]] ; GFX9-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1) - ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE18]] - ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] + ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE14]] + ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] ; GFX9-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX9-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[UADDO74]] ; GFX9-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV49]], [[UADDO74]] - ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD35]] + ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD33]] ; GFX9-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV48]], [[UADDO74]] - ; GFX9-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX9-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX9-NEXT: [[USUBO12:%[0-9]+]]:_(s32), [[USUBO13:%[0-9]+]]:_(s1) = G_USUBO [[UV44]], [[MUL33]] - ; GFX9-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD37]], [[USUBO13]] - ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD37]] + ; GFX9-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD35]], [[USUBO13]] + ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD35]] ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO12]](s32), [[USUBE16]](s32) ; 
GFX9-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64) ; GFX9-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV51]] @@ -2482,71 +2464,69 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX6-NEXT: 
[[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[XOR]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH 
[[UV17]], [[UADDE8]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] @@ -2651,71 +2631,69 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = 
G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], 
[[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT 
[[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], 
[[UADDO36]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] @@ -2820,71 +2798,69 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]] ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; 
GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]] ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = 
G_ZEXT [[UADDO25]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]] + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]] ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], 
[[UADDE6]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]] ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: 
[[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]] ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir @@ -313,72 +313,70 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: 
[[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD 
[[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; 
GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; 
GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -392,8 +390,8 @@ ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]] - ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]] + ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]] ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]] @@ -402,8 +400,8 @@ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), 
[[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]] - ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]] + ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]] ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]] ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] @@ -463,72 +461,70 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), 
[[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], 
[[ADD12]], [[UADDO11]] - ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = 
G_UADDO [[UADDO30]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: 
[[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -542,8 +538,8 @@ ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]] - ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]] + ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]] ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]] @@ -552,8 +548,8 @@ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]] - ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]] + ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]] ; GFX8-NEXT: 
[[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]] ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] @@ -613,72 +609,70 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], 
[[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX9-NEXT: 
[[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX9-NEXT: 
[[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -692,8 +686,8 @@ ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[C6]](s64) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]] - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]] + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]] ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]] @@ -702,8 +696,8 @@ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]] - ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]] + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]] ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]] ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] @@ -777,72 +771,70 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; 
GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = 
G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]] ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), 
[[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX6-NEXT: 
[[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]] ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]] ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -856,8 +848,8 @@ ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV18]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV19]], [[UADDO35]] - ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV19]], [[UADDO35]] + ; GFX6-NEXT: 
[[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV17]] ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV16]] @@ -866,8 +858,8 @@ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV20]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV21]], [[UADDO37]] - ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV21]], [[UADDO37]] + ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]] ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]] ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] @@ -897,96 +889,94 @@ ; GFX6-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[FPTOUI2]] ; GFX6-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[FPTOUI3]] ; GFX6-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[FPTOUI2]] - ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX6-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX6-NEXT: 
[[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX6-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1) ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH16]] ; GFX6-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1) - ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX6-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX6-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) ; GFX6-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH18]] ; GFX6-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1) - ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD21]] + ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD20]] ; GFX6-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) - ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; 
GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX6-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO46]] - ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO49]] - ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO49]] ; GFX6-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDO48]] ; GFX6-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[UADDO48]] - ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE10]] + ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE8]] ; GFX6-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[UADDO48]] - ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[MUL24]] - ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD27]] + ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[MUL24]] + ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD25]] ; GFX6-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[MUL24]] ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX6-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX6-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH21]] ; GFX6-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[ADD27]] - ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], 
[[MUL24]] - ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD27]] + ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[ADD25]] + ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[MUL24]] + ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD25]] ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX6-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) ; GFX6-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH23]] ; GFX6-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1) - ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD28]] + ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD26]] ; GFX6-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) - ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[ADD27]] - ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[ADD25]] + ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX6-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[UADDO58]] - ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO49]] - ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[C5]], [[UADDO61]] + ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[ADD29]], [[UADDO61]] ; GFX6-NEXT: 
[[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDO60]] - ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE14]] + ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE10]] ; GFX6-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDO60]] ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX6-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX6-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH25]] ; GFX6-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE14]] + ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE10]] ; GFX6-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDO60]] - ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE14]] + ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE10]] ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX6-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) ; GFX6-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH27]] ; GFX6-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1) - ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX6-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD32]] + ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX6-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD30]] ; GFX6-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) 
= G_ZEXT [[UADDO71]](s1) - ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE14]] - ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] - ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD35]](s32) + ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE10]] + ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] + ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD33]](s32) ; GFX6-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[UADDO70]] ; GFX6-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV33]], [[UADDO70]] - ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD35]] + ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD33]] ; GFX6-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV32]], [[UADDO70]] - ; GFX6-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX6-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX6-NEXT: [[USUBO8:%[0-9]+]]:_(s32), [[USUBO9:%[0-9]+]]:_(s1) = G_USUBO [[UV28]], [[MUL33]] - ; GFX6-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD37]], [[USUBO9]] - ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD37]] + ; GFX6-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD35]], [[USUBO9]] + ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD35]] ; GFX6-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE10]](s32), [[UV35]] ; GFX6-NEXT: 
[[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1) @@ -999,8 +989,8 @@ ; GFX6-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[USUBE12]], [[C5]], [[USUBO11]] ; GFX6-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX6-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UV36]] - ; GFX6-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV37]], [[UADDO73]] - ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE16]](s32) + ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV37]], [[UADDO73]] + ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE12]](s32) ; GFX6-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV35]] ; GFX6-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1) ; GFX6-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO10]](s32), [[UV34]] @@ -1009,8 +999,8 @@ ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]] ; GFX6-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX6-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[UV38]] - ; GFX6-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[UV39]], [[UADDO75]] - ; GFX6-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE18]](s32) + ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[UV39]], [[UADDO75]] + ; GFX6-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE14]](s32) ; GFX6-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C5]] ; GFX6-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV5]], [[MV4]] ; GFX6-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C5]] @@ -1073,72 +1063,70 @@ 
; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX8-NEXT: 
[[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL 
[[UV12]], [[UADDE2]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]] ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) 
= G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]] ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]] ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -1152,8 +1140,8 @@ ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV18]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = 
G_UADDE [[ADD16]], [[UV19]], [[UADDO35]] - ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV19]], [[UADDO35]] + ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV17]] ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV16]] @@ -1162,8 +1150,8 @@ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV20]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV21]], [[UADDO37]] - ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV21]], [[UADDO37]] + ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]] ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]] ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] @@ -1193,96 +1181,94 @@ ; GFX8-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[FPTOUI2]] ; GFX8-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[FPTOUI3]] ; GFX8-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[FPTOUI2]] - ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) 
= G_ADD [[ADD18]], [[UMULH15]] ; GFX8-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX8-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX8-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1) ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH16]] ; GFX8-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1) - ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX8-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX8-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) ; GFX8-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH18]] ; GFX8-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1) - ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD21]] + ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD20]] ; GFX8-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) - ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX8-NEXT: 
[[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX8-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO46]] - ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO49]] - ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO49]] ; GFX8-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDO48]] ; GFX8-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[UADDO48]] - ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE10]] + ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE8]] ; GFX8-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[UADDO48]] - ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[MUL24]] - ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD27]] + ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[MUL24]] + ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD25]] ; GFX8-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[MUL24]] ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX8-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX8-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], 
[[UMULH21]] ; GFX8-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[ADD27]] - ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[MUL24]] - ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD27]] + ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[ADD25]] + ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[MUL24]] + ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD25]] ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX8-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) ; GFX8-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH23]] ; GFX8-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1) - ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD28]] + ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD26]] ; GFX8-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) - ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[ADD27]] - ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[ADD25]] + ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX8-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[UADDO58]] - ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) 
= G_UADDE [[ADD25]], [[ADD31]], [[UADDO49]] - ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[C5]], [[UADDO61]] + ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[ADD29]], [[UADDO61]] ; GFX8-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDO60]] - ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE14]] + ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE10]] ; GFX8-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDO60]] ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX8-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX8-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH25]] ; GFX8-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE14]] + ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE10]] ; GFX8-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDO60]] - ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE14]] + ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE10]] ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX8-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) ; GFX8-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH27]] ; GFX8-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1) - ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX8-NEXT: 
[[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD32]] + ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX8-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD30]] ; GFX8-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1) - ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE14]] - ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] - ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD35]](s32) + ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE10]] + ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] + ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD33]](s32) ; GFX8-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[UADDO70]] ; GFX8-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV33]], [[UADDO70]] - ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD35]] + ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD33]] ; GFX8-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV32]], [[UADDO70]] - ; GFX8-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX8-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX8-NEXT: [[USUBO8:%[0-9]+]]:_(s32), [[USUBO9:%[0-9]+]]:_(s1) = G_USUBO [[UV28]], [[MUL33]] - ; GFX8-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD37]], [[USUBO9]] - ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD37]] + ; GFX8-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE 
[[UV29]], [[ADD35]], [[USUBO9]] + ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD35]] ; GFX8-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE10]](s32), [[UV35]] ; GFX8-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1) @@ -1295,8 +1281,8 @@ ; GFX8-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[USUBE12]], [[C5]], [[USUBO11]] ; GFX8-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX8-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UV36]] - ; GFX8-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV37]], [[UADDO73]] - ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE16]](s32) + ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV37]], [[UADDO73]] + ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE12]](s32) ; GFX8-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV35]] ; GFX8-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1) ; GFX8-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO10]](s32), [[UV34]] @@ -1305,8 +1291,8 @@ ; GFX8-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]] ; GFX8-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX8-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[UV38]] - ; GFX8-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[UV39]], [[UADDO75]] - ; GFX8-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE18]](s32) + ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[UV39]], [[UADDO75]] + ; GFX8-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), 
[[UADDE14]](s32) ; GFX8-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C5]] ; GFX8-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV5]], [[MV4]] ; GFX8-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C5]] @@ -1369,72 +1355,70 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: 
[[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; 
GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]] ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], 
[[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]] ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]] ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -1448,8 +1432,8 @@ ; GFX9-NEXT: 
[[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV18]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV19]], [[UADDO35]] - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV19]], [[UADDO35]] + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV17]] ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV16]] @@ -1458,8 +1442,8 @@ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV20]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV21]], [[UADDO37]] - ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV21]], [[UADDO37]] + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]] ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]] ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] @@ -1489,96 +1473,94 @@ ; GFX9-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[FPTOUI2]] ; GFX9-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[FPTOUI3]] ; GFX9-NEXT: 
[[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[FPTOUI2]] - ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX9-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX9-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX9-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1) ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH16]] ; GFX9-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1) - ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX9-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX9-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) ; GFX9-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH18]] ; GFX9-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1) - ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD21]] + ; GFX9-NEXT: 
[[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD20]] ; GFX9-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) - ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX9-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO46]] - ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO49]] - ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO49]] ; GFX9-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDO48]] ; GFX9-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[UADDO48]] - ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE10]] + ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE8]] ; GFX9-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[UADDO48]] - ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[MUL24]] - ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD27]] + ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[MUL24]] + ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD25]] ; GFX9-NEXT: 
[[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[MUL24]] ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX9-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX9-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH21]] ; GFX9-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[ADD27]] - ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[MUL24]] - ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD27]] + ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[ADD25]] + ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[MUL24]] + ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD25]] ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX9-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) ; GFX9-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH23]] ; GFX9-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1) - ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD28]] + ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD26]] ; GFX9-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) - ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[ADD27]] - ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; 
GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[ADD25]] + ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX9-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[UADDO58]] - ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO49]] - ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[C5]], [[UADDO61]] + ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[ADD29]], [[UADDO61]] ; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDO60]] - ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE14]] + ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE10]] ; GFX9-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDO60]] ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX9-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX9-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH25]] ; GFX9-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE14]] + ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE10]] ; GFX9-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDO60]] - ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE14]] + ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE10]] ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX9-NEXT: 
[[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) ; GFX9-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH27]] ; GFX9-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1) - ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX9-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD32]] + ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX9-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD30]] ; GFX9-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1) - ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE14]] - ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] - ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD35]](s32) + ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE10]] + ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] + ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD33]](s32) ; GFX9-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[UADDO70]] ; GFX9-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV33]], [[UADDO70]] - ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD35]] + ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD33]] ; GFX9-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV32]], [[UADDO70]] - ; GFX9-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX9-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX9-NEXT: [[USUBO8:%[0-9]+]]:_(s32), 
[[USUBO9:%[0-9]+]]:_(s1) = G_USUBO [[UV28]], [[MUL33]] - ; GFX9-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD37]], [[USUBO9]] - ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD37]] + ; GFX9-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD35]], [[USUBO9]] + ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD35]] ; GFX9-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE10]](s32), [[UV35]] ; GFX9-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1) @@ -1591,8 +1573,8 @@ ; GFX9-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[USUBE12]], [[C5]], [[USUBO11]] ; GFX9-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX9-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UV36]] - ; GFX9-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV37]], [[UADDO73]] - ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE16]](s32) + ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV37]], [[UADDO73]] + ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE12]](s32) ; GFX9-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV35]] ; GFX9-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1) ; GFX9-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO10]](s32), [[UV34]] @@ -1601,8 +1583,8 @@ ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]] ; GFX9-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64) ; GFX9-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[UV38]] - ; GFX9-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE 
[[UADDE16]], [[UV39]], [[UADDO75]] - ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE18]](s32) + ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[UV39]], [[UADDO75]] + ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE14]](s32) ; GFX9-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C5]] ; GFX9-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV5]], [[MV4]] ; GFX9-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C5]] @@ -2178,72 +2160,70 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = 
G_UADDO [[UADDO12]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; 
GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]] + ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + 
; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; 
GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -2257,8 +2237,8 @@ ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]] - ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]] + ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]] ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]] @@ -2267,8 +2247,8 @@ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]] - ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]] + ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], 
[[MV1]] ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -2331,72 +2311,70 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH 
[[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]] + ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX8-NEXT: 
[[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX8-NEXT: 
[[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -2410,8 +2388,8 @@ ; GFX8-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), 
[[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]] - ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]] + ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]] ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]] @@ -2420,8 +2398,8 @@ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]] - ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]] + ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]] ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] @@ -2484,72 +2462,70 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], 
[[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: 
[[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]] + ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO 
[[UADDO24]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32) + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[AND1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) @@ -2563,8 +2539,8 @@ ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]] - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32) + ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]] + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32) 
; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]] ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1) ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]] @@ -2573,8 +2549,8 @@ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]] ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64) ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]] - ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32) + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]] + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32) ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]] ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]] ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir @@ -289,71 +289,69 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; 
GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; 
GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; 
GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = 
G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] @@ -435,71 +433,69 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; 
GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = 
G_UMULH [[UADDE]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: 
[[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: 
[[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] @@ -581,71 +577,69 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - 
; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], 
[[UADDO23]] + ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: 
[[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), 
[[UV13]] @@ -741,71 +735,69 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], 
[[ADD7]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]] + ; GFX6-NEXT: 
[[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]] ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: 
[[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]] @@ -857,95 +849,93 @@ ; GFX6-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[FPTOUI2]] ; GFX6-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]] ; GFX6-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]] - ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], 
[[MUL20]] + ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX6-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX6-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX6-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UMULH16]] ; GFX6-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX6-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX6-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1) ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH18]] ; GFX6-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1) - ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD21]] + ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD20]] ; GFX6-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) - ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], 
[[ZEXT19]] - ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX6-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO42]] - ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO45]] - ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO45]] ; GFX6-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO44]] ; GFX6-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[UADDO44]] - ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE6]] + ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE4]] ; GFX6-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO44]] - ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[MUL24]] - ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD27]] + ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL24]] + ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD25]] ; GFX6-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[MUL24]] ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX6-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX6-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = 
G_UADDO [[UADDO46]], [[UMULH21]] ; GFX6-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[ADD27]] - ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[MUL24]] - ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD27]] + ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD25]] + ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL24]] + ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD25]] ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX6-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX6-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH23]] ; GFX6-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD28]] + ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD26]] ; GFX6-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) - ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[ADD27]] - ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD25]] + ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX6-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[UADDO54]] - ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), 
[[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO45]] - ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[C5]], [[UADDO57]] + ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD29]], [[UADDO57]] ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDO56]] - ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE10]] + ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE6]] ; GFX6-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDO56]] ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX6-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX6-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH25]] ; GFX6-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE10]] + ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE6]] ; GFX6-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDO56]] - ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE10]] + ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE6]] ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX6-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX6-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH27]] ; GFX6-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; 
GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD32]] + ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD30]] ; GFX6-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) - ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE10]] - ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] + ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE6]] + ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] ; GFX6-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[UADDO66]] ; GFX6-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV29]], [[UADDO66]] - ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD35]] + ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD33]] ; GFX6-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV28]], [[UADDO66]] - ; GFX6-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX6-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX6-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV24]], [[MUL33]] - ; GFX6-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD37]], [[USUBO11]] - ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD37]] + ; GFX6-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD35]], [[USUBO11]] + ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD35]] ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES 
[[USUBO10]](s32), [[USUBE14]](s32) ; GFX6-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV31]] @@ -1030,71 +1020,69 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; 
GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), 
[[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]] ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: 
[[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]] @@ -1146,95 +1134,93 @@ ; GFX8-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[FPTOUI2]] ; GFX8-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]] ; GFX8-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], 
[[FPTOUI2]] - ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX8-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX8-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX8-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UMULH16]] ; GFX8-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX8-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX8-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1) ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH18]] ; GFX8-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1) - ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD21]] + ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX8-NEXT: 
[[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD20]] ; GFX8-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) - ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX8-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO42]] - ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO45]] - ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO45]] ; GFX8-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO44]] ; GFX8-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[UADDO44]] - ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE6]] + ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE4]] ; GFX8-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO44]] - ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[MUL24]] - ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD27]] + ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]] + ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL24]] + ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD25]] ; GFX8-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[MUL24]] ; GFX8-NEXT: 
[[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX8-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX8-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH21]] ; GFX8-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[ADD27]] - ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[MUL24]] - ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD27]] + ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD25]] + ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL24]] + ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD25]] ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX8-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX8-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH23]] ; GFX8-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD28]] + ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD26]] ; GFX8-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) - ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[ADD27]] - ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD25]] + ; 
GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX8-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[UADDO54]] - ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO45]] - ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[C5]], [[UADDO57]] + ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD29]], [[UADDO57]] ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDO56]] - ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE10]] + ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE6]] ; GFX8-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDO56]] ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX8-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX8-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH25]] ; GFX8-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE10]] + ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE6]] ; GFX8-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDO56]] - ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE10]] + ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE6]] ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX8-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX8-NEXT: 
[[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH27]] ; GFX8-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD32]] + ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD30]] ; GFX8-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) - ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE10]] - ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] + ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE6]] + ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] ; GFX8-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[UADDO66]] ; GFX8-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV29]], [[UADDO66]] - ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD35]] + ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD33]] ; GFX8-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV28]], [[UADDO66]] - ; GFX8-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX8-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX8-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV24]], [[MUL33]] - ; GFX8-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD37]], [[USUBO11]] - ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD37]] + ; 
GFX8-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD35]], [[USUBO11]] + ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD35]] ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO10]](s32), [[USUBE14]](s32) ; GFX8-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV31]] @@ -1319,71 +1305,69 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = 
G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]] + ; GFX9-NEXT: 
[[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]] ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: 
[[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]] ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), 
[[UV17]] @@ -1435,95 +1419,93 @@ ; GFX9-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[FPTOUI2]] ; GFX9-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]] ; GFX9-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]] - ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] - ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]] + ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]] + ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]] ; GFX9-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]] - ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]] + ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]] ; GFX9-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]] ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]] ; GFX9-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1) ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UMULH16]] ; GFX9-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1) - ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] - ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]] + ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]] + ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]] ; GFX9-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]] - ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]] + ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]] ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]] ; GFX9-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1) ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH18]] ; GFX9-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1) - ; GFX9-NEXT: 
[[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] - ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD21]] + ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]] + ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD20]] ; GFX9-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1) - ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]] - ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]] - ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]] + ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]] + ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]] + ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]] ; GFX9-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO42]] - ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO45]] - ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]] + ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO45]] ; GFX9-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO44]] ; GFX9-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[UADDO44]] - ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE6]] + ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE4]] ; GFX9-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO44]] - ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] - ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]] - ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[MUL24]] - ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD27]] + ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]] + ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], 
[[UMULH20]] + ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL24]] + ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD25]] ; GFX9-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[MUL24]] ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]] ; GFX9-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1) ; GFX9-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH21]] ; GFX9-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1) - ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] - ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[ADD27]] - ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[MUL24]] - ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD27]] + ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]] + ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD25]] + ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL24]] + ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD25]] ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]] ; GFX9-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1) ; GFX9-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH23]] ; GFX9-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1) - ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] - ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD28]] + ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]] + ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD26]] ; GFX9-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1) - ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]] - ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH 
[[UADDE6]], [[ADD27]] - ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]] + ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]] + ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD25]] + ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]] ; GFX9-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[UADDO54]] - ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO45]] - ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[C5]], [[UADDO57]] + ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD29]], [[UADDO57]] ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDO56]] - ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE10]] + ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE6]] ; GFX9-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDO56]] ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]] ; GFX9-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1) ; GFX9-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH25]] ; GFX9-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1) - ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] - ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE10]] + ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]] + ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE6]] ; GFX9-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDO56]] - ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE10]] + ; GFX9-NEXT: 
[[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE6]] ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]] ; GFX9-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1) ; GFX9-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH27]] ; GFX9-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1) - ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] - ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD32]] + ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]] + ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD30]] ; GFX9-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1) - ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]] - ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE10]] - ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]] + ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]] + ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE6]] + ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]] ; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[UADDO66]] ; GFX9-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV29]], [[UADDO66]] - ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD35]] + ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD33]] ; GFX9-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV28]], [[UADDO66]] - ; GFX9-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] - ; GFX9-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]] + ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]] + ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]] ; GFX9-NEXT: [[USUBO10:%[0-9]+]]:_(s32), 
[[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV24]], [[MUL33]] - ; GFX9-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD37]], [[USUBO11]] - ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD37]] + ; GFX9-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD35]], [[USUBO11]] + ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD35]] ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO10]](s32), [[USUBE14]](s32) ; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV31]] @@ -2079,71 +2061,69 @@ ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX6-NEXT: 
[[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], 
[[UADDO20]] - ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]] + ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD 
[[ZEXT12]], [[ZEXT13]] - ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES 
[[USUBO2]](s32), [[USUBE2]](s32) ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] @@ -2228,71 +2208,69 @@ ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]] - ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; 
GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]] + ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[AND]](s64) ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], 
[[UADDE4]] - ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] @@ -2377,71 +2355,69 @@ ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]] ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], 
[[ADD5]], [[UADDO11]] - ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]] ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]] ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]] ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]] ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]] - ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] - ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]] + ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]] + ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]] ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]] ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]] ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]] ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1) - ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] - ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]] + ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]] + ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]] ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]] - ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]] + ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]] ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]] ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1) ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]] ; GFX9-NEXT: 
[[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1) - ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] - ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]] + ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]] + ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]] ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1) - ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]] - ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]] - ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]] + ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]] + ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]] + ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]] ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]] - ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]] - ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]] + ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]] ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]] - ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]] + ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]] ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]] ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]] ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1) ; GFX9-NEXT: 
[[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]] ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1) - ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] - ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]] + ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]] + ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]] ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]] - ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]] + ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]] ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]] ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1) ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]] ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1) - ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] - ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]] + ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]] + ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]] ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1) - ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]] - ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]] - ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]] + ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]] + ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]] + ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]] ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]] ; GFX9-NEXT: 
[[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]] - ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]] + ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]] ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]] - ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] - ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]] + ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]] + ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]] ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]] - ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]] - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]] + ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]] + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64) ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -68,38 +68,36 @@ ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; CHECK-NEXT: v_addc_u32_e64 v11, s[4:5], v6, v10, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v9, v3 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v11 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v11, v8, v3 ; CHECK-NEXT: v_mul_hi_u32 v8, v8, v3 -; CHECK-NEXT: v_add_i32_e64 
v6, s[4:5], v6, v10 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13 -; CHECK-NEXT: v_mul_lo_u32 v12, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v13 -; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v10, v11, v8 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v3, v8 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v12, v3, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v11 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v12, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v8 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, 
v8 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v6 ; CHECK-NEXT: v_mul_hi_u32 v10, v4, v3 @@ -225,24 +223,53 @@ ; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s11 ; CHECK-NEXT: s_mov_b32 s7, s6 ; CHECK-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] -; CHECK-NEXT: s_sub_u32 s3, 0, s10 +; CHECK-NEXT: s_sub_u32 s0, 0, s10 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b32 s1, 1, 0 +; CHECK-NEXT: s_and_b32 s1, s1, 1 +; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v1, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: s_subb_u32 s5, 0, s11 +; CHECK-NEXT: s_subb_u32 s1, 0, s11 +; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0 +; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0 +; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, 
v0, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1 +; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0 +; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0 ; CHECK-NEXT: v_mov_b32_e32 v6, s11 -; CHECK-NEXT: v_mul_lo_u32 v3, s3, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, s5, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s3, v0 -; CHECK-NEXT: v_mul_lo_u32 v4, s3, v0 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4 @@ -267,38 +294,7 @@ ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v2, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, s5, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, s3, v3 -; CHECK-NEXT: v_mul_hi_u32 v8, s3, v0 -; CHECK-NEXT: v_mul_lo_u32 v7, s3, v0 -; CHECK-NEXT: v_add_i32_e64 v1, s[0:1], v1, v2 -; CHECK-NEXT: v_add_i32_e64 v4, s[0:1], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v4, s[0:1], v4, v8 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v4 -; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v8, v2 -; CHECK-NEXT: 
v_mul_hi_u32 v8, v0, v4 -; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; CHECK-NEXT: v_add_i32_e64 v7, s[0:1], v7, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v3, v4 -; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; CHECK-NEXT: v_add_i32_e64 v4, s[0:1], v7, v5 -; CHECK-NEXT: v_add_i32_e64 v3, s[0:1], v3, v4 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v5, s12, v0 @@ -453,38 +449,36 @@ ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_addc_u32_e64 v15, s[4:5], v10, v14, vcc +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v13, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v12, v15 -; GISEL-NEXT: v_mul_lo_u32 v17, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v12, v9 ; GISEL-NEXT: v_mul_hi_u32 v12, v12, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v17 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v9, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v15, v17 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v12 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v12 -; 
GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v12 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v15 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v1, v9 ; GISEL-NEXT: v_mul_lo_u32 v13, v0, v10 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v9 @@ -591,40 +585,38 @@ ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; 
GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v13, vcc +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v12, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v16 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v14, v16 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v11 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v8, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14 
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v8 ; GISEL-NEXT: v_mul_lo_u32 v12, v2, v9 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 @@ -754,38 +746,36 @@ ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 -; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v11, v14, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v14, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v3 ; CGP-NEXT: v_mul_hi_u32 v12, v12, v3 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 -; CGP-NEXT: v_mul_lo_u32 v16, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v14, v3, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, 
v14 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v3, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 -; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v12, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v11, v15 +; CGP-NEXT: v_mul_lo_u32 v14, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v3, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v11, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v16, v11, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v3, v12 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_mul_hi_u32 v12, v11, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v12, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v10, v3 ; CGP-NEXT: v_mul_lo_u32 v13, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v14, v4, v3 @@ -933,38 
+923,36 @@ ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v9, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v11, v5 -; CGP-NEXT: v_mul_lo_u32 v14, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v5 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v5 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v5 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v15 -; CGP-NEXT: v_mul_lo_u32 v14, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v15 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v13, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v10 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v5, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: 
v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc ; CGP-NEXT: v_mul_lo_u32 v10, v8, v5 ; CGP-NEXT: v_mul_lo_u32 v11, v6, v9 ; CGP-NEXT: v_mul_hi_u32 v12, v6, v5 @@ -1063,32 +1051,62 @@ ; CHECK-LABEL: v_sdiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_movk_i32 s6, 0xf000 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: s_movk_i32 s4, 0xf000 +; CHECK-NEXT: s_movk_i32 s6, 0x1000 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 
0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1096,7 +1114,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -1107,44 +1125,12 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1152,20 +1138,20 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 ; CHECK-NEXT: 
v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 @@ -1178,7 +1164,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v7, s7 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 @@ -1193,12 +1179,12 @@ ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 4096 ret i64 %result @@ -1219,13 +1205,13 @@ ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 -; GISEL-NEXT: s_sub_u32 s11, 0, s8 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s4, 0, s8 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s12, 0, s9 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_subb_u32 s5, 0, s9 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 @@ -1234,10 +1220,10 @@ ; 
GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s11, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, s11, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 @@ -1264,39 +1250,37 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s11, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s11, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; 
GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v1, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, v0, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 @@ -1361,13 +1345,13 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 -; GISEL-NEXT: s_sub_u32 s8, 0, s6 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s4, 
0, s6 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s9, 0, s7 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_subb_u32 s5, 0, s7 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 @@ -1377,12 +1361,12 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -1410,39 +1394,37 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s9, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, s8, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: 
v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 
v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 @@ -1511,32 +1493,31 @@ ; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_movk_i32 s6, 0xf000 +; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: v_mov_b32_e32 v4, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -1555,50 +1536,49 @@ ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, 
v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v9, 
vcc, v9, v12 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc @@ -1681,40 +1661,38 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e64 v9, 
s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 
0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 @@ -1786,32 +1764,62 @@ ; CHECK-LABEL: v_sdiv_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: s_mov_b32 s4, 0xffed2705 +; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; CHECK-NEXT: 
v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 
v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1819,7 +1827,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -1830,44 +1838,12 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1875,20 +1851,20 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, 
v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 @@ -1901,7 +1877,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v7, s7 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 @@ -1916,12 +1892,12 @@ ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1942,13 +1918,13 @@ ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 -; GISEL-NEXT: s_sub_u32 s11, 0, s8 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s4, 0, s8 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s12, 0, s9 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_subb_u32 s5, 0, s9 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 @@ -1957,10 +1933,10 @@ ; GISEL-NEXT: 
v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s11, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, s11, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 @@ -1987,39 +1963,37 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s11, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s11, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v1, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, v0, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 @@ -2084,13 +2058,13 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 -; GISEL-NEXT: s_sub_u32 s8, 0, s6 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s4, 0, s6 +; 
GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s9, 0, s7 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_subb_u32 s5, 0, s7 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 @@ -2100,12 +2074,12 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -2133,39 +2107,37 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s9, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, s8, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 
v9, s4, v5 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; 
GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 @@ -2234,32 +2206,62 @@ ; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_mov_b32 s6, 0xffed2705 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; CGP-NEXT: v_mov_b32_e32 v4, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, 
v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -2278,39 +2280,7 @@ ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; 
CGP-NEXT: v_mul_lo_u32 v12, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 @@ -2404,40 +2374,38 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; 
CGP-NEXT: v_mul_lo_u32 v11, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 @@ -2571,38 +2539,36 @@ ; CHECK-NEXT: 
v_add_i32_e32 v12, vcc, v13, v12 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CHECK-NEXT: v_addc_u32_e64 v11, s[4:5], v6, v10, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v9, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v11 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v11, v8, v5 ; CHECK-NEXT: v_mul_hi_u32 v8, v8, v5 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13 -; CHECK-NEXT: v_mul_lo_u32 v12, v5, v8 -; CHECK-NEXT: v_mul_hi_u32 v10, v5, v13 -; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v10, v11, v8 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v5, v8 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v12, v5, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v11 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, 
vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v12, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_mul_hi_u32 v10, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v9, v3, v6 ; CHECK-NEXT: v_mul_hi_u32 v10, v3, v5 @@ -2702,8 +2668,8 @@ ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[7:8], s[6:7], v4 +; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 +; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 @@ -2716,123 +2682,121 @@ ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc ; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_xor_b32_e32 v9, v0, v10 ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v5 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v0 -; 
GISEL-NEXT: v_mul_lo_u32 v14, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v0 -; GISEL-NEXT: v_xor_b32_e32 v17, v1, v10 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v15 -; GISEL-NEXT: v_mul_lo_u32 v16, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v1, v0, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v15 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v0, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 ; 
GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v8, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v1 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v0 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v1, v15 -; GISEL-NEXT: v_mul_lo_u32 v14, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v15 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v11 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v1, v1, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v13 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v12 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v17, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 
-; GISEL-NEXT: v_lshl_b64 v[0:1], s[6:7], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v17, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_xor_b32_e32 v14, v1, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v1 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v8 +; GISEL-NEXT: v_lshl_b64 v[0:1], s[4:5], v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v17, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v14, 
v8 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v5, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v6 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v17, v11, vcc -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v17, v11 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v17, v12 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v14, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], v14, v9 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v7, vcc +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v9, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 +; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v11, v5 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, v14, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v6 ; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v8, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v7 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v14, vcc @@ -2858,133 +2822,131 @@ ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v10 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v10, vcc ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_xor_b32_e32 v3, v1, v10 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v1, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v3, v3 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 ; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v8 ; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v9, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v1 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v3 ; GISEL-NEXT: v_mul_hi_u32 v16, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v0 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 +; GISEL-NEXT: v_xor_b32_e32 v17, v1, v10 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, 
v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v14, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v3, v15 ; GISEL-NEXT: v_mul_lo_u32 v16, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v15 -; GISEL-NEXT: v_xor_b32_e32 v4, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v0, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v3, v15 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v1, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v3, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_mul_hi_u32 v16, v0, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v3, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v1, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; 
GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v11, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v0 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v13 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v16 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v14, v16 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; GISEL-NEXT: v_mul_hi_u32 v15, v0, v11 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v12 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v4, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v1, v3 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v11 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v3 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v5, v17, v11 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v6, v2, v3 ; GISEL-NEXT: v_mul_hi_u32 v11, v2, v11 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v3 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 
v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v2, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v17, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v2, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v9 ; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v8 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v8 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, 
v3, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v11 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -3059,38 +3021,36 @@ ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 -; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v9, v14, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v9 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v3 ; CGP-NEXT: v_mul_hi_u32 v12, v12, v3 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 -; CGP-NEXT: v_mul_lo_u32 v16, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v14, v3, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v3, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 
0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 -; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v15 +; CGP-NEXT: v_mul_lo_u32 v14, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v3, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v9, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v16, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v3, v12 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v8, v3 ; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v4, v3 @@ -3238,38 +3198,36 @@ ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v8, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc ; CGP-NEXT: v_mul_lo_u32 v11, 
v11, v6 -; CGP-NEXT: v_mul_lo_u32 v14, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v8 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v6 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v6 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v15 -; CGP-NEXT: v_mul_lo_u32 v14, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v12, v6, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v15 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v13, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v6, v10 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v8, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v6, v10 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 
0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc ; CGP-NEXT: v_mul_lo_u32 v10, v7, v6 ; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 ; CGP-NEXT: v_mul_hi_u32 v12, v5, v6 @@ -3429,90 +3387,88 @@ ; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_and_b32_e32 v5, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 ; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 -; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v4, v4 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v2 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v2 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v2 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v11 -; 
GISEL-NEXT: v_add_i32_e32 v5, vcc, 0, v5 -; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v10 +; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v7, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v2, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: 
v_addc_u32_e64 v10, s[4:5], v4, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v2 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v7, v2 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v7 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, v2, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0, v5 +; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc +; GISEL-NEXT: 
v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v2 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v4 ; GISEL-NEXT: v_mul_hi_u32 v9, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 @@ -3520,7 +3476,7 @@ ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -3532,8 +3488,8 @@ ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; 
GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v13, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v7 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v11, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v11, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] @@ -3551,147 +3507,145 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 0, v6 -; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v6 +; GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v5, vcc -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v12, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v4, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: 
v_sub_i32_e32 v5, vcc, 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v3 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v3 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 0, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v3, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v5, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v2 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v2 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v2, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v2, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v9 -; GISEL-NEXT: 
v_mul_hi_u32 v11, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v2, v10 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], v4, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v0 -; 
GISEL-NEXT: v_mul_lo_u32 v10, v5, v3 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v5, v0 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v11 -; GISEL-NEXT: v_mul_lo_u32 v10, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v9, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v3, v5 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v9 -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v8 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v3 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v0 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v6 ; 
GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v2, vcc -; GISEL-NEXT: v_mul_hi_u32 v2, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v2 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v2, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v2 -; GISEL-NEXT: v_mul_lo_u32 v5, v6, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v2 -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v13, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v14, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v14, v4 -; 
GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v5, v2 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v2 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v12, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v7 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -168,9 +168,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s14, 0, s8 -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8-NEXT: s_and_b32 s0, s0, 1 +; GFX8-NEXT: s_sub_u32 s0, 0, s8 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 @@ -178,17 +178,46 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s15, 0, s9 -; GFX8-NEXT: v_mul_lo_u32 v2, s14, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, s15, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s14, v0 -; GFX8-NEXT: v_mul_lo_u32 v4, s14, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s9 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_subb_u32 s1, 0, s9 +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_mul_lo_u32 v3, 
v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4 +; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, s9 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v7, v0, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 @@ -209,38 +238,7 @@ ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v2, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s15, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, s14, v3 -; GFX8-NEXT: v_mul_hi_u32 v8, s14, v0 -; GFX8-NEXT: v_mul_lo_u32 v7, s14, v0 -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v5 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v8 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4 
-; GFX8-NEXT: v_mul_hi_u32 v2, v0, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 -; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v5 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v5, s10, v0 @@ -297,15 +295,16 @@ ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, 
v10, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 @@ -348,9 +347,9 @@ ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s14, 0, s8 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -358,12 +357,12 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_subb_u32 s15, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s15, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s14, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s14, v0 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, s11 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 @@ -387,36 +386,34 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s14, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, s14, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, s14, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v5, v6 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v7 -; 
GFX9-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v5, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v5, v2 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, 
v5, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v5, s10, v0 @@ -472,15 +469,16 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 @@ -512,16 +510,16 @@ ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_addc_u32 s9, s11, s12 -; GFX10-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX10-NEXT: s_sub_u32 s1, 0, s8 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: s_and_b32 s0, s0, 1 +; GFX10-NEXT: s_sub_u32 s10, 0, s8 +; GFX10-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10-NEXT: s_and_b32 s11, s11, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: 
s_subb_u32 s14, 0, s9 +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_subb_u32 s11, 0, s9 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -530,11 +528,11 @@ ; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX10-NEXT: v_mul_lo_u32 v2, s1, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, s10, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s14, v0 -; GFX10-NEXT: v_mul_hi_u32 v4, s1, v0 -; GFX10-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s11, v0 +; GFX10-NEXT: v_mul_hi_u32 v4, s10, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX10-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v1, v5 @@ -543,111 +541,110 @@ ; GFX10-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX10-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v6, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s14, v3, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v6, s14, v7, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v3, s14, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v5, s14, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s14 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v7, v6 -; GFX10-NEXT: v_add_co_u32 v3, s0, v5, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s14, v5, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s14 ; GFX10-NEXT: v_add_co_u32 v0, 
vcc_lo, v0, v3 ; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v4, s14, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, s1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v1, v2, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v7, s1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v7 -; GFX10-NEXT: v_add3_u32 v4, v4, v6, v5 -; GFX10-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX10-NEXT: v_mul_hi_u32 v7, v0, v7 -; GFX10-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX10-NEXT: v_mul_lo_u32 v9, v3, v4 -; GFX10-NEXT: v_mul_hi_u32 v10, v0, v4 -; GFX10-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v8, s0, v9, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v7, s0, v8, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v9, v8 -; GFX10-NEXT: v_add_co_u32 v5, s0, v7, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add3_u32 v2, v4, v6, v3 +; GFX10-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v2, s11, v0 -; GFX10-NEXT: v_mul_hi_u32 v4, s11, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX10-NEXT: v_mul_lo_u32 v5, s11, v1 -; GFX10-NEXT: v_mul_hi_u32 v6, s10, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v5, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; 
GFX10-NEXT: v_add_co_u32 v2, s0, v4, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v4, s10, v1 +; GFX10-NEXT: v_mul_hi_u32 v6, v1, v5 +; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX10-NEXT: v_mul_lo_u32 v7, v1, v2 +; GFX10-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX10-NEXT: v_add_co_u32 v3, s10, v3, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v6, s10, v7, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v3, s10, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v5, s10, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v7, v6 +; GFX10-NEXT: v_add_co_u32 v3, s10, v5, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2 +; GFX10-NEXT: v_mul_hi_u32 v4, s1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1 +; GFX10-NEXT: v_mul_hi_u32 v6, s0, v1 +; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v4, s10, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v2, s10, v4, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_co_u32 v0, s0, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 +; GFX10-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, s10 ; GFX10-NEXT: v_mul_lo_u32 v5, s8, v0 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1 ; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, s8, v0 ; GFX10-NEXT: v_mul_lo_u32 v4, s8, v1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v0, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s11, v2 -; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s10, v5 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v2, s0, s11, v2, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v8, s1, v2 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s0, v5 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v2, s0, s1, v2, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v5, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v5, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v9 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s9, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v13, s0, v3, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v9 -; GFX10-NEXT: v_cndmask_b32_e64 
v11, v12, v11, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v2 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, v12, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v13, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v10, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v12 +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v9 ; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v13, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v6, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s8, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s9, v1 @@ -1365,9 +1362,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s16, 0, s8 -; GFX8-NEXT: s_cselect_b32 
s0, 1, 0 -; GFX8-NEXT: s_and_b32 s0, s0, 1 +; GFX8-NEXT: s_sub_u32 s0, 0, s8 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 @@ -1375,17 +1372,46 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s17, 0, s9 -; GFX8-NEXT: v_mul_lo_u32 v2, s16, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, s17, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s16, v0 -; GFX8-NEXT: v_mul_lo_u32 v4, s16, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s9 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_subb_u32 s1, 0, s9 +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4 +; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, s1, 
v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, s9 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v7, v0, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 @@ -1406,38 +1432,7 @@ ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v2, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s17, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, s16, v3 -; GFX8-NEXT: v_mul_hi_u32 v8, s16, v0 -; GFX8-NEXT: v_mul_lo_u32 v7, s16, v0 -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v5 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v8 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 -; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v5 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s15, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s14, v1 ; GFX8-NEXT: v_mul_hi_u32 v5, s14, v0 @@ -1494,17 +1489,18 @@ ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: s_ashr_i32 s8, s11, 31 ; GFX8-NEXT: s_ashr_i32 s12, s3, 31 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: s_add_u32 s0, s10, s8 ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -1523,32 +1519,61 @@ ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v4 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX8-NEXT: s_sub_u32 s0, 0, s2 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 +; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: v_mul_f32_e32 v4, 
0x5f7ffffc, v4 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 +; GFX8-NEXT: v_trunc_f32_e32 v6, v6 +; GFX8-NEXT: v_mul_f32_e32 v7, 0xcf800000, v6 +; GFX8-NEXT: v_add_f32_e32 v4, v7, v4 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_subb_u32 s1, 0, s3 +; GFX8-NEXT: v_mul_lo_u32 v4, s1, v7 +; GFX8-NEXT: v_mul_lo_u32 v8, s0, v6 +; GFX8-NEXT: v_mul_hi_u32 v10, s0, v7 +; GFX8-NEXT: v_mul_lo_u32 v9, s0, v7 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v10 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mul_lo_u32 v10, v6, v9 +; GFX8-NEXT: v_mul_lo_u32 v11, v7, v8 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v2, v6, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v7 -; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: v_trunc_f32_e32 v3, v3 -; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] -; GFX8-NEXT: v_mul_f32_e32 v6, 0xcf800000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX8-NEXT: s_sub_u32 s10, 0, s2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8-NEXT: s_and_b32 s0, s0, 1 -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s11, 0, s3 -; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3 -; GFX8-NEXT: v_mul_hi_u32 v9, s10, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v2, v5, vcc +; GFX8-NEXT: v_mul_hi_u32 v2, v7, v9 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v10, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v8 +; GFX8-NEXT: v_mul_hi_u32 v9, v6, v9 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v10, v7, v8 +; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v10 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v9, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s1, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s0, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s0, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v10, s3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 @@ -1556,6 +1581,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v9, v2, v6 ; GFX8-NEXT: v_mul_hi_u32 v11, v2, v8 ; GFX8-NEXT: v_mul_hi_u32 v8, v3, v8 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 @@ -1574,56 +1600,25 @@ ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 -; GFX8-NEXT: v_addc_u32_e64 v7, s[0:1], v3, v6, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, s11, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 -; GFX8-NEXT: v_mul_hi_u32 v12, s10, v2 -; GFX8-NEXT: v_mul_lo_u32 v11, s10, v2 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 -; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v9 -; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v12 -; GFX8-NEXT: v_mul_lo_u32 v9, v7, v11 -; GFX8-NEXT: v_mul_lo_u32 v12, v2, v8 -; GFX8-NEXT: v_mul_hi_u32 v6, v2, v11 -; GFX8-NEXT: v_mul_hi_u32 v11, v7, v11 -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX8-NEXT: 
v_mul_lo_u32 v9, v7, v8 -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v12, v6 -; GFX8-NEXT: v_mul_hi_u32 v12, v2, v8 -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v11, s[0:1], v11, v12 -; GFX8-NEXT: v_mul_hi_u32 v7, v7, v8 -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v11, v9 -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, s7, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, s6, v3 -; GFX8-NEXT: v_mul_hi_u32 v9, s6, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, s7, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, s7 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, s7, v3 +; GFX8-NEXT: v_mul_lo_u32 v9, s11, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_mul_hi_u32 v7, s6, v3 +; GFX8-NEXT: v_mul_hi_u32 v7, s10, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 @@ -1634,9 +1629,9 @@ ; GFX8-NEXT: 
v_mul_lo_u32 v9, s2, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s6, v9 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v9 ; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s7, v6 +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s11, v6 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v7 @@ -1665,7 +1660,6 @@ ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1] @@ -1717,9 +1711,9 @@ ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s16, 0, s8 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -1727,12 +1721,13 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_subb_u32 s17, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v2, s16, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s16, v0 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; 
GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 @@ -1755,41 +1750,39 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s17, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s16, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, s16, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, s16, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v5, v6 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v5, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v5, v2 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; 
GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s15, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s14, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1810,7 +1803,6 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, s8, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s14, v6 ; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v7, v2, vcc @@ -1829,24 +1821,17 @@ ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s8, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc ; 
GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s8, v7 +; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[12:13] ; GFX9-NEXT: s_ashr_i32 s8, s11, 31 ; GFX9-NEXT: s_ashr_i32 s12, s3, 31 @@ -1861,112 +1846,118 @@ ; GFX9-NEXT: s_cmp_lg_u32 s7, 0 ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: s_addc_u32 s3, s3, s12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v6 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9] -; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: s_sub_u32 s7, 0, s2 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mul_f32_e32 
v4, 0x5f7ffffc, v4 -; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v7, 0xcf800000, v6 -; GFX9-NEXT: v_add_f32_e32 v4, v7, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, 0xcf800000, v5 +; GFX9-NEXT: v_add_f32_e32 v4, v6, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: s_cselect_b32 s14, 1, 0 +; GFX9-NEXT: s_and_b32 s14, s14, 1 +; GFX9-NEXT: s_cmp_lg_u32 s14, 0 ; GFX9-NEXT: s_subb_u32 s14, 0, s3 -; GFX9-NEXT: v_mul_lo_u32 v8, s14, v4 -; GFX9-NEXT: v_mul_lo_u32 v9, s7, v6 -; GFX9-NEXT: v_mul_hi_u32 v10, s7, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, s14, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s7, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, s7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_add3_u32 v6, v6, v7, v8 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v10, s1 +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v9 +; GFX9-NEXT: v_mul_lo_u32 v8, v4, v6 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-NEXT: v_add3_u32 v5, v8, v9, v10 -; GFX9-NEXT: v_mul_lo_u32 v8, v6, v7 -; GFX9-NEXT: v_mul_lo_u32 v9, v4, v5 -; GFX9-NEXT: v_mul_hi_u32 v10, v4, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX9-NEXT: v_xor_b32_e32 v3, s6, v3 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-NEXT: v_mul_hi_u32 v10, v4, v9 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v6, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v4, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v6, v5 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v5, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v9 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 ; GFX9-NEXT: v_add_u32_e32 v9, v10, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v5, v9, v8, v5 +; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v6, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s14, v4 -; GFX9-NEXT: v_mul_lo_u32 v9, s7, v7 -; GFX9-NEXT: v_mul_hi_u32 v10, s7, v4 -; GFX9-NEXT: v_mul_lo_u32 v11, s7, v4 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s14, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s7, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX9-NEXT: v_xor_b32_e32 v2, s6, v2 -; GFX9-NEXT: v_add3_u32 v8, v8, v9, v10 -; GFX9-NEXT: v_mul_lo_u32 v9, v7, v11 -; GFX9-NEXT: v_mul_lo_u32 v10, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v6, v4, v11 -; GFX9-NEXT: v_mul_hi_u32 v11, v7, v11 -; GFX9-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v9, v6 -; GFX9-NEXT: 
v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v9, v7, v8 -; GFX9-NEXT: v_add_u32_e32 v6, v10, v6 -; GFX9-NEXT: v_mul_hi_u32 v10, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v7, v7, v8 -; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v9, v6 -; GFX9-NEXT: v_add_u32_e32 v10, v11, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v7, v10, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6 -; GFX9-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX9-NEXT: v_add3_u32 v6, v6, v7, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v9 +; GFX9-NEXT: v_mul_lo_u32 v8, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v11, v4, v9 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v9 +; GFX9-NEXT: v_mov_b32_e32 v10, s6 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v11, v5, v6 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v11, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v9, v11, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v4, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v5, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v7 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v5, 
vcc, v2, v12, vcc -; GFX9-NEXT: v_mul_hi_u32 v2, s10, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v2, v10, vcc +; GFX9-NEXT: v_mul_hi_u32 v2, s10, v7 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s11, v7 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 -; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX9-NEXT: v_mul_lo_u32 v3, s11, v6 ; GFX9-NEXT: v_mul_hi_u32 v7, s11, v7 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v6, v3, v7 +; GFX9-NEXT: v_add3_u32 v3, v7, v3, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 ; GFX9-NEXT: v_mul_hi_u32 v8, s2, v2 @@ -2047,57 +2038,57 @@ ; GFX10-NEXT: s_mov_b32 s7, s6 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, s6 -; GFX10-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] ; GFX10-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX10-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX10-NEXT: s_sub_u32 s22, 0, s8 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_sub_u32 s20, 0, s8 +; GFX10-NEXT: s_cselect_b32 s14, 1, 0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX10-NEXT: s_and_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s14, s14, 1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: 
s_subb_u32 s23, 0, s9 -; GFX10-NEXT: s_ashr_i32 s16, s11, 31 -; GFX10-NEXT: s_xor_b64 s[20:21], s[12:13], s[6:7] -; GFX10-NEXT: s_ashr_i32 s18, s3, 31 +; GFX10-NEXT: s_cmp_lg_u32 s14, 0 +; GFX10-NEXT: s_subb_u32 s21, 0, s9 +; GFX10-NEXT: s_ashr_i32 s14, s11, 31 +; GFX10-NEXT: s_xor_b64 s[18:19], s[12:13], s[6:7] +; GFX10-NEXT: s_ashr_i32 s16, s3, 31 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: s_add_u32 s0, s10, s16 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_mov_b32 s19, s18 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_add_u32 s6, s10, s14 +; GFX10-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-NEXT: s_mov_b32 s17, s16 -; GFX10-NEXT: s_addc_u32 s1, s11, s16 -; GFX10-NEXT: s_add_u32 s2, s2, s18 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_and_b32 s7, s7, 1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_cmp_lg_u32 s7, 0 +; GFX10-NEXT: s_mov_b32 s15, s14 +; GFX10-NEXT: s_addc_u32 s7, s11, s14 +; GFX10-NEXT: s_add_u32 s2, s2, s16 +; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: s_and_b32 s10, s10, 1 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX10-NEXT: s_addc_u32 s3, s3, s18 -; GFX10-NEXT: s_xor_b64 s[10:11], s[0:1], s[16:17] -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[18:19] +; GFX10-NEXT: s_addc_u32 s3, s3, s16 +; GFX10-NEXT: s_xor_b64 s[10:11], s[6:7], s[14:15] +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX10-NEXT: s_sub_u32 s6, 0, s2 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2 -; GFX10-NEXT: s_and_b32 s0, s0, 1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_and_b32 s7, s7, 1 +; 
GFX10-NEXT: s_cmp_lg_u32 s7, 0 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: s_subb_u32 s7, 0, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s22, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, s20, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX10-NEXT: v_mul_lo_u32 v4, s23, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, s22, v0 -; GFX10-NEXT: v_mul_lo_u32 v6, s22, v0 +; GFX10-NEXT: v_mul_lo_u32 v4, s21, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, s20, v0 +; GFX10-NEXT: v_mul_lo_u32 v6, s20, v0 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x2f800000, v1 ; GFX10-NEXT: v_add3_u32 v3, v4, v3, v5 ; GFX10-NEXT: v_trunc_f32_e32 v4, v7 @@ -2111,23 +2102,23 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_add_f32_e32 v1, v9, v1 -; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v10, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v7 +; GFX10-NEXT: v_add_co_u32 v5, s13, v5, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s13 +; GFX10-NEXT: v_add_co_u32 v6, s13, v10, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s13 +; GFX10-NEXT: v_add_co_u32 v5, s13, v5, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s13 ; GFX10-NEXT: v_mul_lo_u32 v9, s6, v4 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 +; GFX10-NEXT: v_add_co_u32 v6, s13, v6, v11 ; GFX10-NEXT: v_mul_lo_u32 v12, s7, v1 ; GFX10-NEXT: v_mul_hi_u32 v13, s6, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s13 ; GFX10-NEXT: v_mul_lo_u32 v11, s6, v1 -; GFX10-NEXT: v_add_co_u32 v5, s0, v6, v5 +; GFX10-NEXT: v_add_co_u32 v5, s13, v6, v5 ; 
GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s13 ; GFX10-NEXT: v_add3_u32 v8, v12, v9, v13 ; GFX10-NEXT: v_mul_lo_u32 v9, v4, v11 ; GFX10-NEXT: v_mul_hi_u32 v10, v1, v11 @@ -2136,116 +2127,112 @@ ; GFX10-NEXT: v_mul_lo_u32 v6, v1, v8 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v8 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 -; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s0, v2, v3, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v8 -; GFX10-NEXT: v_mul_lo_u32 v14, s23, v0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v9, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v11 -; GFX10-NEXT: v_mul_hi_u32 v15, s22, v0 -; GFX10-NEXT: v_mul_lo_u32 v16, s22, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mul_lo_u32 v13, s22, v0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v7, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v12, s21, v0 +; GFX10-NEXT: v_add_co_u32 v6, s13, v9, v6 +; GFX10-NEXT: v_mul_hi_u32 v13, s20, v0 +; GFX10-NEXT: v_mul_lo_u32 v14, s20, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s13 +; GFX10-NEXT: v_add_co_u32 v7, s13, v7, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s13 +; GFX10-NEXT: v_add_co_u32 v6, s13, v6, v10 +; GFX10-NEXT: v_mul_lo_u32 v3, s20, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s13 +; GFX10-NEXT: v_add_co_u32 v5, s13, v7, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s13 +; GFX10-NEXT: v_add3_u32 v12, v12, v14, v13 ; GFX10-NEXT: v_add_nc_u32_e32 v6, v9, v6 -; GFX10-NEXT: v_add3_u32 v14, v14, v16, v15 ; GFX10-NEXT: v_mul_hi_u32 v8, v4, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_mul_lo_u32 v10, v12, v13 +; GFX10-NEXT: v_mul_lo_u32 v10, v2, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7 -; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v6 -; 
GFX10-NEXT: v_mul_lo_u32 v11, v0, v14 -; GFX10-NEXT: v_mul_hi_u32 v9, v0, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mul_hi_u32 v13, v12, v13 -; GFX10-NEXT: v_mul_lo_u32 v15, v12, v14 -; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5 +; GFX10-NEXT: v_mul_lo_u32 v11, v0, v12 +; GFX10-NEXT: v_add_co_u32 v5, s13, v5, v6 +; GFX10-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s13 +; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v13, v2, v12 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v5 +; GFX10-NEXT: v_add_co_u32 v5, s13, v10, v11 +; GFX10-NEXT: v_mul_hi_u32 v14, v0, v12 ; GFX10-NEXT: v_add3_u32 v6, v7, v6, v8 -; GFX10-NEXT: v_add_co_u32 v5, s1, v10, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 -; GFX10-NEXT: v_mul_hi_u32 v16, v0, v14 -; GFX10-NEXT: v_add_co_u32 v8, s1, v15, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s1, v4, v6, s0 -; GFX10-NEXT: v_add_co_u32 v5, s1, v5, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s1 -; GFX10-NEXT: v_mul_lo_u32 v13, s7, v1 +; GFX10-NEXT: v_mul_lo_u32 v10, s7, v1 +; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s13 +; GFX10-NEXT: v_add_co_u32 v3, s13, v13, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX10-NEXT: v_add_co_u32 v3, s7, v3, v14 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v5 -; GFX10-NEXT: v_mul_hi_u32 v15, s6, v1 -; GFX10-NEXT: v_mul_lo_u32 v9, s6, v11 -; GFX10-NEXT: v_mul_hi_u32 v7, v12, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v10, v16 -; GFX10-NEXT: v_add_co_u32 v5, s1, v8, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v6 -; GFX10-NEXT: v_add3_u32 v9, v13, v9, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s13 +; GFX10-NEXT: v_mul_hi_u32 
v11, s6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s7 +; GFX10-NEXT: v_mul_lo_u32 v13, s6, v4 +; GFX10-NEXT: v_mul_hi_u32 v7, v2, v12 +; GFX10-NEXT: v_mul_lo_u32 v6, s6, v1 +; GFX10-NEXT: v_add_co_u32 v3, s6, v3, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_add3_u32 v9, v10, v13, v11 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX10-NEXT: v_add3_u32 v5, v8, v5, v7 +; GFX10-NEXT: v_mul_lo_u32 v10, v4, v6 +; GFX10-NEXT: v_mul_lo_u32 v7, v1, v9 +; GFX10-NEXT: v_mul_hi_u32 v11, v1, v6 +; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v3, v4, v9 +; GFX10-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX10-NEXT: v_mul_hi_u32 v12, s0, v0 +; GFX10-NEXT: v_mul_lo_u32 v13, s0, v2 +; GFX10-NEXT: v_add_co_u32 v7, s6, v10, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s6 +; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX10-NEXT: v_add_co_u32 v3, s6, v3, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v7, s6, v7, v11 +; GFX10-NEXT: v_mul_lo_u32 v14, s1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v5, s6, v5, v13 +; GFX10-NEXT: v_mul_hi_u32 v15, s0, v2 +; GFX10-NEXT: v_mul_hi_u32 v8, v1, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v12 +; GFX10-NEXT: v_add_co_u32 v0, s6, v14, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s6 +; GFX10-NEXT: v_mul_hi_u32 v2, s1, v2 +; GFX10-NEXT: v_add_co_u32 v0, s6, v0, v15 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v3, s6, v3, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v0, s6, v0, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v11, v12, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7 +; GFX10-NEXT: v_mul_hi_u32 v9, v4, v9 
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX10-NEXT: v_mul_hi_u32 v8, s8, v0 +; GFX10-NEXT: v_add3_u32 v2, v11, v5, v2 +; GFX10-NEXT: v_add_co_u32 v3, s6, v3, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_mul_lo_u32 v7, s9, v0 +; GFX10-NEXT: v_mul_lo_u32 v10, s8, v2 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 +; GFX10-NEXT: v_add3_u32 v5, v6, v5, v9 +; GFX10-NEXT: v_mul_lo_u32 v6, s8, v0 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_add3_u32 v7, v10, v8, v7 -; GFX10-NEXT: v_mul_lo_u32 v14, v1, v9 -; GFX10-NEXT: v_mul_lo_u32 v12, v11, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v7, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v13, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v3, v11, v3 -; GFX10-NEXT: v_mul_lo_u32 v8, v11, v9 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v10, v1, v9 -; GFX10-NEXT: v_add_co_u32 v7, s1, v12, v14 -; GFX10-NEXT: v_mul_hi_u32 v9, v11, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v3, s1, v8, v3 -; GFX10-NEXT: v_mul_lo_u32 v8, s15, v0 -; GFX10-NEXT: v_mul_lo_u32 v14, s14, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 -; GFX10-NEXT: v_mul_hi_u32 v12, s14, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s15, v0 -; GFX10-NEXT: v_add_co_u32 v7, s1, v7, v13 -; GFX10-NEXT: v_mul_lo_u32 v13, s15, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v3, s1, v3, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v0, s1, v13, v0 -; GFX10-NEXT: v_mul_hi_u32 v15, s14, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7 -; GFX10-NEXT: v_mul_hi_u32 v2, s15, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v10 -; GFX10-NEXT: v_add_co_u32 v0, 
s1, v0, v15 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v14, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v0, s1, v0, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v13, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v3, s1, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 -; GFX10-NEXT: v_mul_lo_u32 v6, s9, v0 -; GFX10-NEXT: v_add3_u32 v2, v10, v8, v2 -; GFX10-NEXT: v_add3_u32 v5, v5, v7, v9 -; GFX10-NEXT: v_mul_hi_u32 v7, s8, v0 -; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, vcc_lo, v4, v5, s0 -; GFX10-NEXT: v_mul_lo_u32 v5, s8, v0 -; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo -; GFX10-NEXT: v_add3_u32 v4, v6, v8, v7 -; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1 -; GFX10-NEXT: v_mul_hi_u32 v7, s11, v1 -; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s14, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, s15, v4 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v4, s0, s15, v4, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo +; GFX10-NEXT: v_add3_u32 v4, v7, v10, v8 +; GFX10-NEXT: v_mul_lo_u32 v5, s11, v1 +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, s0, v6 ; GFX10-NEXT: v_mul_lo_u32 v14, s10, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v8, s1, v4 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v4, s0, s1, v4, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v6 +; GFX10-NEXT: v_mul_hi_u32 v7, s11, v1 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 @@ -2253,104 +2240,105 @@ ; GFX10-NEXT: v_mul_hi_u32 v1, s10, v1 ; GFX10-NEXT: v_mul_hi_u32 v17, s10, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v5, s8 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v6, s8 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 
s9, v4 ; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, v11, v10, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v13 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v14 +; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v15, v7 -; GFX10-NEXT: v_add_co_u32 v1, s1, v6, v1 +; GFX10-NEXT: v_add_co_u32 v1, s1, v5, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v17 ; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v17, s0, v0, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v2, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v14, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v13 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v15 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v16, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v14, s0, v17, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v18, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: v_add3_u32 v3, v5, v1, v3 +; GFX10-NEXT: v_sub_co_u32 v1, s0, v12, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v17, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v18, v15, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v15, s3, v7 +; GFX10-NEXT: v_mul_lo_u32 v16, s2, v3 +; GFX10-NEXT: v_mul_hi_u32 v17, s2, v7 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX10-NEXT: v_mul_lo_u32 v10, s2, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v11 -; GFX10-NEXT: v_add3_u32 v3, v6, v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v10, 
s3, v7 -; GFX10-NEXT: v_mul_lo_u32 v16, s2, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v15, s0 -; GFX10-NEXT: v_mul_lo_u32 v11, s2, v3 -; GFX10-NEXT: v_mul_hi_u32 v15, s2, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v14, s0 -; GFX10-NEXT: v_sub_co_u32 v14, s1, v12, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s1, 0, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo -; GFX10-NEXT: v_add3_u32 v6, v10, v11, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v14, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v13, v8, s0 -; GFX10-NEXT: v_sub_co_u32 v8, s0, s10, v16 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s11, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s11, v6 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v10 -; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 -; GFX10-NEXT: v_xor_b32_e32 v5, s12, v5 -; GFX10-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v4, vcc_lo, s3, v4, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s2 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v4, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s20 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s21, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc_lo +; GFX10-NEXT: v_add3_u32 v8, v15, v16, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v1, s0 
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v5, s0 +; GFX10-NEXT: v_sub_co_u32 v10, s0, s10, v10 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s1, s11, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s11, v8 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v11 +; GFX10-NEXT: v_xor_b32_e32 v0, s18, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, s19, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, vcc_lo, s3, v1, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v10, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v8, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s18 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v11 +; GFX10-NEXT: v_xor_b32_e32 v2, s12, v6 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s3, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v12, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v15, s0, v7, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v14, s0 -; GFX10-NEXT: v_add_co_u32 v14, s0, v15, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 +; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: v_sub_co_u32 v11, s0, v12, s2 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s2 +; 
GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v12, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v14, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v8, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v8, s12, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v12, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v4, s0 -; GFX10-NEXT: s_xor_b64 s[0:1], s[16:17], s[18:19] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v5, s12 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v11, v6, s0 +; GFX10-NEXT: s_xor_b64 s[0:1], s[14:15], s[16:17] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s12 ; GFX10-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX10-NEXT: v_xor_b32_e32 v3, s1, v3 -; GFX10-NEXT: v_xor_b32_e32 v6, s16, v6 -; GFX10-NEXT: v_xor_b32_e32 v7, s16, v8 +; GFX10-NEXT: v_xor_b32_e32 v7, s14, v10 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v8, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v8, s14, v6 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, s16 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s16, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s14 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s14, v8, vcc_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] ; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -21,40 +21,40 @@ ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v0 ; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v3 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v0 -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v1 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v6 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v6, v6 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 ; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v8, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 ; CHECK-NEXT: v_mul_hi_u32 v12, v7, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v1 +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CHECK-NEXT: v_mul_lo_u32 v10, v6, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 ; CHECK-NEXT: 
v_mul_lo_u32 v12, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v13, v2, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v6, v11 -; CHECK-NEXT: v_xor_b32_e32 v5, v5, v1 +; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v13, v6, v9 +; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 @@ -62,102 +62,100 @@ ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v9, vcc +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v8, v2 -; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v12, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, v7, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v2 ; CHECK-NEXT: v_mul_hi_u32 v7, v7, v2 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12 -; CHECK-NEXT: v_mul_lo_u32 v11, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7 -; 
CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CHECK-NEXT: v_mul_hi_u32 v11, v2, v7 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v10 +; CHECK-NEXT: v_mul_lo_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, v10 +; CHECK-NEXT: v_mul_hi_u32 v10, v5, v10 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v11, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v6 -; CHECK-NEXT: v_mul_hi_u32 v9, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v5, v2 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 +; CHECK-NEXT: 
v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v4, v2 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v5, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v4, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; 
CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v0 ; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v4, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v1 ; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v3 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v2, v2, v1 -; CHECK-NEXT: v_xor_b32_e32 v3, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v2, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_xor_b32_e32 v2, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v2, v6, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: BB0_2: ; %Flow @@ -221,24 +219,53 @@ ; CHECK-NEXT: v_cvt_f32_u32_e32 
v1, s9 ; CHECK-NEXT: s_mov_b32 s7, s6 ; CHECK-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] -; CHECK-NEXT: s_sub_u32 s3, 0, s8 +; CHECK-NEXT: s_sub_u32 s0, 0, s8 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b32 s1, 1, 0 +; CHECK-NEXT: s_and_b32 s1, s1, 1 +; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v1, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: s_subb_u32 s5, 0, s9 +; CHECK-NEXT: s_subb_u32 s1, 0, s9 +; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0 +; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0 +; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; 
CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1 +; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0 +; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0 ; CHECK-NEXT: v_mov_b32_e32 v6, s9 -; CHECK-NEXT: v_mul_lo_u32 v3, s3, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, s5, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s3, v0 -; CHECK-NEXT: v_mul_lo_u32 v4, s3, v0 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4 @@ -263,38 +290,7 @@ ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v2, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, s5, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, s3, v3 -; CHECK-NEXT: v_mul_hi_u32 v8, s3, v0 -; CHECK-NEXT: v_mul_lo_u32 v7, s3, v0 -; CHECK-NEXT: v_add_i32_e64 v1, s[0:1], v1, v2 -; CHECK-NEXT: v_add_i32_e64 v4, s[0:1], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v4, s[0:1], v4, v8 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v4 -; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v0, v4 -; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; CHECK-NEXT: v_add_i32_e64 v7, s[0:1], v7, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v3, v4 -; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; CHECK-NEXT: v_add_i32_e64 v4, s[0:1], v7, v5 -; CHECK-NEXT: v_add_i32_e64 v3, s[0:1], v3, v4 -; CHECK-NEXT: 
v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v5, s10, v0 @@ -445,38 +441,36 @@ ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v10, v13, vcc +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v12, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v16 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v14, v16 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v11 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: 
v_addc_u32_e32 v10, vcc, v10, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v14 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v8, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v15, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v8 ; GISEL-NEXT: v_mul_lo_u32 v12, v0, v10 ; GISEL-NEXT: v_mul_hi_u32 v13, v0, v8 @@ -581,40 +575,38 @@ ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v8, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v10, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v10, v10, v7 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: 
v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v13, v15 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v10 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v10 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v10 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; 
GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, v3, v7 ; GISEL-NEXT: v_mul_lo_u32 v11, v2, v8 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 @@ -695,35 +687,35 @@ ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 ; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 ; CGP-NEXT: v_xor_b32_e32 v0, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v11 -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v11 +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v1 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v11, v1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v11, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v10, v10 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 ; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 ; CGP-NEXT: v_mul_lo_u32 v14, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v16, v11, v2 ; CGP-NEXT: v_mul_lo_u32 v15, v11, v2 -; CGP-NEXT: v_xor_b32_e32 v4, v4, v1 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 ; CGP-NEXT: 
v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_mul_lo_u32 v14, v10, v15 ; CGP-NEXT: v_mul_lo_u32 v16, v2, v13 ; CGP-NEXT: v_mul_hi_u32 v17, v2, v15 ; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v1 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 @@ -742,41 +734,39 @@ ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v10, v13, vcc +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v16, v11, v2 +; CGP-NEXT: v_mul_lo_u32 v13, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v14, v11, v2 ; CGP-NEXT: v_mul_hi_u32 v11, v11, v2 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 -; CGP-NEXT: v_mul_lo_u32 v15, v2, v11 -; CGP-NEXT: v_mul_hi_u32 v13, v2, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v13, v14, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v2, v11 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 
-; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v2, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v10, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v10, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -784,7 +774,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v3, v10 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 @@ -796,42 +786,42 @@ ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_mul_lo_u32 v11, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v10, v3, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v3, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v1, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v5, v2, vcc ; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v0 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v4, v3 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v3, v1 ; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v3 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; CGP-NEXT: 
v_xor_b32_e32 v2, v2, v1 -; CGP-NEXT: v_xor_b32_e32 v3, v0, v1 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v2, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_xor_b32_e32 v2, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: BB2_2: ; %Flow2 @@ -870,35 +860,35 @@ ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 ; CGP-NEXT: v_xor_b32_e32 v2, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 -; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v9 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v9 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v3 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v9, v3, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v6 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v9, v6, vcc ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v10, v4 ; CGP-NEXT: v_mul_lo_u32 v12, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v14, v9, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v4 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v3 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v6 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; CGP-NEXT: v_mul_lo_u32 v12, v8, v13 ; CGP-NEXT: v_mul_lo_u32 v14, 
v4, v11 ; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_xor_b32_e32 v7, v7, v3 +; CGP-NEXT: v_xor_b32_e32 v7, v7, v6 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 @@ -917,41 +907,39 @@ ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v11, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc ; CGP-NEXT: v_mul_lo_u32 v10, v10, v4 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v14, v9, v4 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v4 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v14 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v12 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -959,7 +947,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 @@ -971,42 +959,42 @@ ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v5, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v5, v4 
+; CGP-NEXT: v_mul_lo_u32 v8, v3, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v7, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v6, v5 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v3 ; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v4, v3 -; CGP-NEXT: v_xor_b32_e32 v5, v2, v3 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v4, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 +; CGP-NEXT: v_xor_b32_e32 
v4, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v3, v6 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v4, v6, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB2_6: ; %Flow @@ -1043,32 +1031,62 @@ ; CHECK-LABEL: v_srem_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_movk_i32 s6, 0xf000 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: s_movk_i32 s4, 0xf000 +; CHECK-NEXT: s_movk_i32 s6, 0x1000 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1076,7 +1094,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: 
v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -1087,44 +1105,12 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; 
CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1132,31 +1118,31 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v4 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; 
CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] ; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 @@ -1170,13 +1156,13 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1197,13 +1183,13 @@ ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 -; GISEL-NEXT: s_sub_u32 s11, 0, s8 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s4, 0, s8 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s12, 0, s9 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_subb_u32 s5, 0, s9 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 @@ -1212,10 +1198,10 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s11, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, s11, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, 
s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 @@ -1242,39 +1228,37 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s11, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s11, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 
v11, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v1, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, v0, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 @@ -1337,14 +1321,14 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 -; GISEL-NEXT: s_sub_u32 s8, 0, s6 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s4, 0, s6 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, 
vcc -; GISEL-NEXT: s_subb_u32 s9, 0, s7 +; GISEL-NEXT: s_subb_u32 s5, 0, s7 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 @@ -1353,13 +1337,13 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, s8, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -1387,39 +1371,37 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s9, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, s8, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, 
s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 ; 
GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 @@ -1487,32 +1469,31 @@ ; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_movk_i32 s6, 0xf000 +; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: v_mov_b32_e32 v4, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -1531,40 +1512,39 @@ ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: 
v_mul_lo_u32 v13, s6, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, 
vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 @@ -1655,40 +1635,38 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 -; 
CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 @@ -1758,32 +1736,62 @@ ; CHECK-LABEL: v_srem_i64_oddk_denom: ; CHECK: ; 
%bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: s_mov_b32 s4, 0xffed2705 +; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: 
v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1791,7 +1799,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -1802,44 +1810,12 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, 
vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: 
v_add_i32_e32 v5, vcc, v5, v7 @@ -1847,31 +1823,31 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v4 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] ; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 @@ -1885,13 +1861,13 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; 
CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1912,13 +1888,13 @@ ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 -; GISEL-NEXT: s_sub_u32 s11, 0, s8 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s4, 0, s8 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s12, 0, s9 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_subb_u32 s5, 0, s9 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 @@ -1927,10 +1903,10 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s11, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, s11, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, 
v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 @@ -1957,39 +1933,37 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s11, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s11, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; 
GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v1, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, v0, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 @@ -2052,14 +2026,14 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 -; GISEL-NEXT: s_sub_u32 s8, 0, s6 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s4, 0, s6 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GISEL-NEXT: s_subb_u32 s9, 0, s7 +; GISEL-NEXT: s_subb_u32 s5, 0, s7 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 @@ -2068,13 +2042,13 @@ ; GISEL-NEXT: 
v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, s8, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -2102,39 +2076,37 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s9, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v11, s8, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 @@ -2202,32 +2174,62 @@ ; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_mov_b32 s6, 0xffed2705 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; 
CGP-NEXT: v_mov_b32_e32 v4, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: 
v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -2246,39 +2248,7 @@ ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: 
v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 @@ -2370,40 +2340,38 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: 
v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 @@ -2488,40 +2456,40 @@ ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v0 ; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v0 -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v1 -; 
CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v6 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v6, v6 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v8, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 ; CHECK-NEXT: v_mul_hi_u32 v12, v7, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v2 -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v1 +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CHECK-NEXT: v_mul_lo_u32 v10, v6, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 ; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v13, v2, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v6, v11 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v13, v6, v9 +; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CHECK-NEXT: 
v_mul_hi_u32 v12, v2, v9 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 @@ -2529,102 +2497,100 @@ ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v9, vcc +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v8, v2 -; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v12, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, v7, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v2 ; CHECK-NEXT: v_mul_hi_u32 v7, v7, v2 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12 -; CHECK-NEXT: v_mul_lo_u32 v11, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CHECK-NEXT: v_mul_hi_u32 v11, v2, v7 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; CHECK-NEXT: v_add_i32_e64 v7, 
s[4:5], v7, v9 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v10 +; CHECK-NEXT: v_mul_lo_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, v10 +; CHECK-NEXT: v_mul_hi_u32 v10, v5, v10 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v11, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v9, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v4, v2 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: 
v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v4, v2, vcc +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v4, v2, vcc ; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v0 ; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v5 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v1 ; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v5 +; 
CHECK-NEXT: v_sub_i32_e32 v1, vcc, v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v2, v2, v1 -; CHECK-NEXT: v_xor_b32_e32 v3, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v2, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_xor_b32_e32 v2, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v2, v6, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: BB7_2: ; %Flow @@ -2662,8 +2628,8 @@ ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 +; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 +; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc @@ -2675,108 +2641,106 @@ ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: v_xor_b32_e32 v9, v0, v4 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; 
GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v8 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v16, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v14 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v12 -; GISEL-NEXT: v_mul_hi_u32 v1, v0, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v0, v7, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v12 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v0, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v15, v7, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 -; 
GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v7, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v1 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v10, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v10, v10, v0 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v14 -; GISEL-NEXT: v_mul_lo_u32 v13, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v1, v14 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v1, v10 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v10 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v1, v1, v10 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v10 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v11 
-; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v9, v10 -; GISEL-NEXT: v_lshl_b64 v[0:1], s[6:7], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v0, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v1 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v7 +; GISEL-NEXT: v_lshl_b64 v[0:1], s[4:5], v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v16, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 
v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 ; GISEL-NEXT: v_mul_lo_u32 v7, v8, v7 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 ; GISEL-NEXT: v_mul_hi_u32 v6, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v9, v11 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v16, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v16, v6 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v16, v10 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v13, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v8 @@ -2816,115 +2780,113 @@ ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v9 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v9, vcc ; GISEL-NEXT: 
v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_xor_b32_e32 v3, v1, v9 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v1, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v3, v3 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 ; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v8 ; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v3 ; GISEL-NEXT: v_mul_hi_u32 v15, v10, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v0 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v14 +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v14 ; GISEL-NEXT: v_mul_lo_u32 v15, v0, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v0, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v1, v14 +; GISEL-NEXT: v_mul_hi_u32 v1, v0, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v3, v14 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v0, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_mul_hi_u32 v12, v1, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_mul_hi_u32 v12, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v1, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v10, v1 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v10, v10, v0 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v14, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v13, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v10 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 
-; GISEL-NEXT: v_mul_hi_u32 v10, v13, v10 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v11 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v1, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v3 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v10 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v3 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v6, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v10 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v12 ; GISEL-NEXT: 
v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v5, v2, v3 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v10 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v6, v16, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v8, v3 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v2, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v16, v6 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v2, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 ; GISEL-NEXT: 
v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v4, v8 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v2, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] @@ -2938,8 +2900,8 @@ ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 ; GISEL-NEXT: v_xor_b32_e32 v4, v2, v9 @@ -2968,35 +2930,35 @@ ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 ; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 ; CGP-NEXT: v_xor_b32_e32 v0, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v9 -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v4 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, 
v1 ; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 ; CGP-NEXT: v_mul_lo_u32 v14, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v16, v9, v2 ; CGP-NEXT: v_mul_lo_u32 v15, v9, v2 -; CGP-NEXT: v_xor_b32_e32 v4, v4, v1 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_mul_lo_u32 v14, v8, v15 ; CGP-NEXT: v_mul_lo_u32 v16, v2, v13 ; CGP-NEXT: v_mul_hi_u32 v17, v2, v15 ; CGP-NEXT: v_mul_hi_u32 v15, v8, v15 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v1 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 @@ -3015,41 +2977,39 @@ ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v8, v13, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v15, v9, v14 -; CGP-NEXT: v_mul_lo_u32 v16, v9, v2 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v8 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v2 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v2 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 -; CGP-NEXT: v_mul_lo_u32 v15, v2, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v2, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v13, v14, v9 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v2, v9 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v2, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v8, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v9 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v6, v2 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v6, v2 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -3057,7 +3017,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v6, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v12, v4, 
v8 +; CGP-NEXT: v_mul_hi_u32 v12, v3, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 @@ -3069,42 +3029,42 @@ ; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v8 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v3, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v6, v2, vcc ; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v6, v2 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v4, v3 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v3, v1 ; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v3 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 
v0, v12, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v1 -; CGP-NEXT: v_xor_b32_e32 v3, v0, v1 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v2, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_xor_b32_e32 v2, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB8_2: ; %Flow2 @@ -3143,40 +3103,40 @@ ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v11 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v2 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 ; CGP-NEXT: v_xor_b32_e32 v2, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v2 -; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v3 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; CGP-NEXT: v_trunc_f32_e32 v7, v7 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, 
v2, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v10, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v7 ; CGP-NEXT: v_mul_hi_u32 v14, v9, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v4 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v3 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v8 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v12, v7, v13 ; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_xor_b32_e32 v7, v7, v3 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v7, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 @@ -3184,102 +3144,100 @@ ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v11, vcc +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc ; CGP-NEXT: v_mul_lo_u32 v10, v10, v4 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v14, v9, v4 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v4 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; 
CGP-NEXT: v_mul_lo_u32 v10, v12, v14 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_lo_u32 v10, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v7, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, 
v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v5, v8 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v6, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v7, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v6, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v7, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 +; 
CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v6, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v6, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v2 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v3 ; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v4, v3 -; CGP-NEXT: v_xor_b32_e32 v5, v2, v3 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v4, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_xor_b32_e32 v4, v2, v8 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v3, v8 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v4, v8, vcc ; CGP-NEXT: ; implicit-def: 
$vgpr10_vgpr11 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: BB8_6: ; %Flow @@ -3377,90 +3335,88 @@ ; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_and_b32_e32 v5, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 ; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 -; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v4, v4 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v2 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v2 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v2 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v11 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0, v5 -; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: 
v_add_i32_e32 v10, vcc, v0, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v10 +; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v7, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v2, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v4, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v2 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v7, v2 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, 
v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v7 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, v2, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0, v5 +; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 
v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v2 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v4 ; GISEL-NEXT: v_mul_hi_u32 v9, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 @@ -3468,7 +3424,7 @@ ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -3480,8 +3436,8 @@ ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v5, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v2, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v13, v2 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 @@ -3492,11 +3448,9 @@ ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v1 ; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v2, 
vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v6 @@ -3504,116 +3458,116 @@ ; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v11 -; 
GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 0, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v4, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v4, v10 +; GISEL-NEXT: v_mul_hi_u32 v5, v4, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: 
v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v5, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v7, v0 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v11 -; GISEL-NEXT: v_mul_lo_u32 v10, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v5, v10 ; GISEL-NEXT: v_mul_lo_u32 v9, v4, v7 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v0, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v4, v4, v7 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 
v7, s[4:5], v10, v9 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v0, v8 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v4, v10 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v2, vcc -; GISEL-NEXT: v_mul_hi_u32 v2, v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v2, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_mul_hi_u32 
v8, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_mul_lo_u32 v5, v6, v2 ; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 ; GISEL-NEXT: v_mul_lo_u32 v7, v3, v2 ; GISEL-NEXT: v_mul_hi_u32 v2, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v13, v7 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v2, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v14, v2 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v11, v7 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -58,38 +58,36 @@ ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v1, v8, vcc -; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: 
v_addc_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v0 ; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v11, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; CHECK-NEXT: v_mul_hi_u32 v9, v6, v0 +; CHECK-NEXT: v_mul_lo_u32 v6, v6, v1 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v8 +; CHECK-NEXT: v_mul_hi_u32 v11, v0, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CHECK-NEXT: v_mul_lo_u32 v7, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v10, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v13, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v6 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v11 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; 
CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v5, v0 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 @@ -195,24 +193,24 @@ ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s3 -; CHECK-NEXT: s_sub_u32 s6, 0, s2 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 +; CHECK-NEXT: s_sub_u32 s4, 0, s2 +; CHECK-NEXT: s_cselect_b32 s5, 1, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v2 -; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: s_and_b32 s5, s5, 1 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_subb_u32 s7, 0, s3 +; CHECK-NEXT: s_cmp_lg_u32 s5, 0 +; CHECK-NEXT: s_subb_u32 s5, 0, s3 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v2 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, s6, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, s7, v0 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v0 +; CHECK-NEXT: v_mul_lo_u32 v4, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, s4, v0 +; CHECK-NEXT: v_mul_lo_u32 v6, s5, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, s4, v0 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CHECK-NEXT: v_mul_lo_u32 v6, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v8, v0, v5 @@ -237,38 +235,36 @@ ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; 
CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v2, v4, vcc -; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v4 -; CHECK-NEXT: v_mul_lo_u32 v4, s6, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, s7, v0 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v0 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v6 -; CHECK-NEXT: v_mul_hi_u32 v11, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v5, v5, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v9 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, s4, v0 +; CHECK-NEXT: v_mul_lo_u32 v5, s5, v0 +; CHECK-NEXT: v_mul_hi_u32 v6, s4, v0 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v0, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 +; CHECK-NEXT: 
v_mul_hi_u32 v5, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v4, s1, v0 ; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s1, v0 @@ -409,38 +405,36 @@ ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v13, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 ; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, 
v10 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, 0, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 ; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 @@ -536,38 +530,36 @@ ; GISEL-NEXT: 
v_mul_hi_u32 v11, v5, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v11, vcc -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v8 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; 
GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v5, vcc, 0, v5, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 @@ -680,38 +672,36 @@ ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v1, v12, vcc -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v2, v0 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v0 -; CGP-NEXT: v_mul_lo_u32 v2, v2, v13 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v16, v0, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v12 -; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v3, v2 -; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v0, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v1, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; CGP-NEXT: v_mul_lo_u32 v3, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v2 
-; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v15, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v13, v3 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v15 -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v12, v3 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v2, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v2 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 ; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 @@ -844,38 +834,36 @@ ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v3, v10, vcc -; CGP-NEXT: v_add_i32_e64 v3, 
s[4:5], v3, v10 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_mul_lo_u32 v10, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v2 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v2 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v11 -; CGP-NEXT: v_mul_lo_u32 v13, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v10 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v13 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, 
vcc, v10, v14 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v9, v2 ; CGP-NEXT: v_mul_hi_u32 v5, v8, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v9, v2 @@ -1130,38 +1118,36 @@ ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v1, v8, vcc -; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v0 ; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v0 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v11, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 -; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 -; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v10 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v8 +; CHECK-NEXT: v_mul_hi_u32 v11, v0, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v13, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v9, v2 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, 
s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v11 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v8 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 ; CHECK-NEXT: v_mul_hi_u32 v7, v3, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 @@ -1299,38 +1285,36 @@ ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v13, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v6 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 ; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 ; 
GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 
v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, 0, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 ; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 @@ -1426,38 +1410,36 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v11, vcc -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; GISEL-NEXT: v_cndmask_b32_e64 
v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v8 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v7, vcc, 0, v7, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 ; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 @@ -1573,38 +1555,36 @@ ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v1, v12, vcc -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v4, v0 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v0 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v13 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v12 
-; CGP-NEXT: v_mul_hi_u32 v16, v0, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v12 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v6, v4 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v14 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v0 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v0, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v1, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; CGP-NEXT: v_mul_lo_u32 v6, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v16 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v15 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v4 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: 
v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 ; CGP-NEXT: v_mul_hi_u32 v6, v8, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 @@ -1737,38 +1717,36 @@ ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v3, v8, vcc -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v2 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v8 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v9, v8 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v6, v4 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v9, v4 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v9, v6 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v13 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 
v8, s[4:5], v9, v8 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v7, v2 ; CGP-NEXT: v_mul_hi_u32 v6, v5, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v7, v2 @@ -1987,171 +1965,167 @@ ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 -; GISEL-NEXT: s_bfe_i32 s10, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s8, -1, 0x10000 ; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 ; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mov_b32_e32 v16, s4 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GISEL-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_mov_b32_e32 v19, s5 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mov_b32_e32 v15, s7 +; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v4, v6 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 -; GISEL-NEXT: v_addc_u32_e64 v16, s[6:7], v11, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v19, v16, v17 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v17 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v18, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v15 -; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v15 -; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v4, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; 
GISEL-NEXT: v_mul_hi_u32 v5, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 ; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v18, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v10, v5 -; GISEL-NEXT: v_mov_b32_e32 v5, s10 -; GISEL-NEXT: v_mov_b32_e32 v10, s11 -; GISEL-NEXT: v_add_i32_e64 v8, s[10:11], v8, v12 -; GISEL-NEXT: v_mov_b32_e32 v12, s12 -; GISEL-NEXT: v_add_i32_e64 v11, s[10:11], v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 -; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v16, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v16, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v19, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v15, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], 
v19, v15 -; GISEL-NEXT: v_mov_b32_e32 v19, s13 -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v17 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v14, v13 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v16, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v14, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v9 +; 
GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v11, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; GISEL-NEXT: v_mov_b32_e32 v18, s8 +; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: 
v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v16, v1, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v3, v7 -; GISEL-NEXT: v_mul_lo_u32 v17, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v3, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v4 -; GISEL-NEXT: 
v_addc_u32_e32 v17, vcc, 0, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v17, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v5 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], 1, v7 -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v8, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v14, s[8:9], 0, v11, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 1, v7 +; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v6, 
s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[8:9], 0, v10, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v13, v16, v13, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], 1, v15 -; GISEL-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v17, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v11 -; GISEL-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v15, v16, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], 1, v12 +; GISEL-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v14, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v10 +; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[6:7] ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 -; GISEL-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 1, v13 -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], 0, v18, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 1, v8 +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], 0, v17, s[6:7] ; GISEL-NEXT: v_sub_i32_e64 v9, s[6:7], 0, v9 ; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -2160,21 +2134,21 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v19, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v0, 
v15, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v9, v17, v16, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v8, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -116,14 +116,14 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX8-NEXT: s_sub_u32 s2, 0, s10 -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8-NEXT: s_sub_u32 s0, 0, s10 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s0, s0, 1 -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s3, 0, s11 +; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_subb_u32 s1, 0, s11 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 @@ -131,15 +131,44 @@ ; 
GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s2, v0 -; GFX8-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v7, v0, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 @@ 
-160,38 +189,7 @@ ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v2, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX8-NEXT: v_mul_hi_u32 v8, s2, v0 -; GFX8-NEXT: v_mul_lo_u32 v7, s2, v0 -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v5 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v8 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 -; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v5 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v5, s8, v0 @@ -237,25 +235,26 @@ ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; 
GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s10, v7 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 -; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s10, v7 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -269,14 +268,14 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX9-NEXT: s_sub_u32 s2, 0, s10 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9-NEXT: s_sub_u32 s0, 0, s10 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_subb_u32 s3, 0, s11 +; GFX9-NEXT: s_and_b32 s1, 
s1, 1 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_subb_u32 s1, 0, s11 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -284,10 +283,10 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -310,37 +309,35 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, s2, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: v_add3_u32 v4, v4, v5, v6 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v5, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 -; 
GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v5, v2 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v5, s8, v0 @@ -396,13 +393,14 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v2, vcc -; GFX9-NEXT: 
v_cndmask_b32_e64 v2, v3, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] ; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[6:7] ; GFX9-NEXT: s_endpgm @@ -413,12 +411,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX10-NEXT: s_sub_u32 s1, 0, s10 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: s_and_b32 s0, s0, 1 +; GFX10-NEXT: s_sub_u32 s0, 0, s10 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_and_b32 s1, s1, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: s_subb_u32 s2, 0, s11 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_subb_u32 s1, 0, s11 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -427,11 +425,11 @@ ; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX10-NEXT: v_mul_lo_u32 v2, s1, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s2, v0 -; GFX10-NEXT: v_mul_hi_u32 v4, s1, v0 -; GFX10-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX10-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s0, v0 ; GFX10-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v1, v5 @@ -440,6 +438,33 @@ ; GFX10-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX10-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX10-NEXT: 
v_mul_hi_u32 v2, v1, v2 +; GFX10-NEXT: v_add_co_u32 v3, s2, v3, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v6, s2, v7, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v3, s2, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v5, s2, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v7, v6 +; GFX10-NEXT: v_add_co_u32 v3, s2, v5, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX10-NEXT: v_mul_lo_u32 v4, s0, v1 +; GFX10-NEXT: v_mul_hi_u32 v6, v1, v5 +; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX10-NEXT: v_mul_lo_u32 v7, v1, v2 +; GFX10-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v6 @@ -454,38 +479,9 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v4, s2, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, s1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v1, v2, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v7, s1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v7 -; GFX10-NEXT: v_add3_u32 v4, v4, v6, v5 -; GFX10-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX10-NEXT: v_mul_hi_u32 v7, v0, v7 -; GFX10-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX10-NEXT: v_mul_lo_u32 v9, v3, v4 -; GFX10-NEXT: v_mul_hi_u32 v10, v0, v4 -; 
GFX10-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v8, s0, v9, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v7, s0, v8, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v9, v8 -; GFX10-NEXT: v_add_co_u32 v5, s0, v7, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add3_u32 v2, v4, v6, v3 +; GFX10-NEXT: v_mul_hi_u32 v4, s9, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX10-NEXT: v_mul_hi_u32 v4, s9, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX10-NEXT: v_mul_lo_u32 v5, s9, v1 @@ -504,49 +500,50 @@ ; GFX10-NEXT: v_add_co_u32 v0, s0, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: v_mul_lo_u32 v5, s10, v0 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1 ; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX10-NEXT: v_mul_lo_u32 v4, s10, v1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v0, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s9, v2 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v2 ; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s8, v5 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v7, s0, s9, v2, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v6, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s0, s9, v2, vcc_lo +; 
GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v8, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v5, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v7 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v5, s10 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v9 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s11, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v13, s0, v3, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v7 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v8, s0 -; GFX10-NEXT: 
global_store_dwordx2 v9, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v9, v[2:3], s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v8, v12, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v13, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v10, s10 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v12 +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v13, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v6, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, v7, s1 +; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = udiv i64 %x, %y store i64 %div, i64 addrspace(1)* %out0 @@ -1037,14 +1034,14 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX8-NEXT: s_sub_u32 s2, 0, s8 -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8-NEXT: s_sub_u32 s0, 0, s8 +; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s0, s0, 1 -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s3, 0, s9 +; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_subb_u32 s1, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 @@ -1052,15 +1049,45 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: 
v_mul_lo_u32 v2, s2, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s2, v0 -; GFX8-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX8-NEXT: s_sub_u32 s2, 0, s10 +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4 +; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, s9 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v7, v0, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 @@ -1081,39 +1108,7 @@ ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 
v0, vcc, v0, v3 -; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v2, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX8-NEXT: v_mul_hi_u32 v8, s2, v0 -; GFX8-NEXT: v_mul_lo_u32 v7, s2, v0 -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v5 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v8 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX8-NEXT: s_sub_u32 s2, 0, s10 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 -; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v5 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1 ; GFX8-NEXT: v_mul_hi_u32 v5, s12, v0 @@ -1168,31 +1163,61 @@ ; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s11 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] -; 
GFX8-NEXT: v_cvt_f32_u32_e32 v9, s10 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v7 -; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX8-NEXT: v_trunc_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v6, 0xcf800000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v13, s11 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v12, s10 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v9, v4 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX8-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v9 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 +; GFX8-NEXT: v_trunc_f32_e32 v6, v6 +; GFX8-NEXT: v_mul_f32_e32 v7, 0xcf800000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_subb_u32 s3, 0, s11 +; GFX8-NEXT: v_mul_lo_u32 v7, s3, v3 +; GFX8-NEXT: v_mul_lo_u32 v8, s2, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX8-NEXT: v_mul_hi_u32 v10, s2, v3 +; GFX8-NEXT: v_mul_lo_u32 v9, s2, v3 +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v10 +; 
GFX8-NEXT: v_mul_lo_u32 v8, v6, v9 +; GFX8-NEXT: v_mul_lo_u32 v10, v3, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc +; GFX8-NEXT: v_mul_hi_u32 v2, v3, v9 +; GFX8-NEXT: v_mul_hi_u32 v9, v6, v9 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, v6, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v10, v3, v7 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v10 +; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v6, v7, vcc ; GFX8-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX8-NEXT: v_mul_lo_u32 v7, s2, v3 ; GFX8-NEXT: v_mul_hi_u32 v9, s2, v2 @@ -1222,38 +1247,7 @@ ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 -; GFX8-NEXT: v_addc_u32_e64 v7, s[0:1], v3, v6, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, s3, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, s2, v7 -; GFX8-NEXT: v_mul_hi_u32 v12, s2, v2 -; GFX8-NEXT: v_mul_lo_u32 v11, s2, v2 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 -; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v9 -; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v12 -; GFX8-NEXT: v_mul_lo_u32 v9, v7, v11 -; GFX8-NEXT: v_mul_lo_u32 v12, v2, v8 -; GFX8-NEXT: v_mul_hi_u32 v6, v2, v11 -; GFX8-NEXT: v_mul_hi_u32 v11, v7, v11 -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, 
s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v9, v7, v8 -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v12, v6 -; GFX8-NEXT: v_mul_hi_u32 v12, v2, v8 -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v11, s[0:1], v11, v12 -; GFX8-NEXT: v_mul_hi_u32 v7, v7, v8 -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v11, v9 -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; GFX8-NEXT: v_mul_lo_u32 v6, s15, v2 ; GFX8-NEXT: v_mul_lo_u32 v7, s14, v3 ; GFX8-NEXT: v_mul_hi_u32 v9, s14, v2 @@ -1333,14 +1327,14 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX9-NEXT: s_sub_u32 s2, 0, s8 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_subb_u32 s3, 0, s9 +; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -1349,11 +1343,13 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s11 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 +; GFX9-NEXT: s_sub_u32 s2, 0, s10 +; GFX9-NEXT: v_mul_lo_u32 
v2, s0, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0 ; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f800000, v14 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -1376,41 +1372,40 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, s2, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v5, v6 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v5, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v5, v2 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX9-NEXT: 
v_mul_lo_u32 v5, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1431,7 +1426,6 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, s8, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s12, v6 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v7, v2, vcc @@ -1450,8 +1444,8 @@ ; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_add_f32_e32 v5, v14, v5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v5, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] @@ -1459,106 +1453,104 @@ ; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s8, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5 ; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v9 -; GFX9-NEXT: v_trunc_f32_e32 v11, v11 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; GFX9-NEXT: v_mul_f32_e32 v12, 0xcf800000, v11 +; GFX9-NEXT: v_mul_f32_e32 v12, 0x2f800000, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX9-NEXT: v_add_f32_e32 v5, v12, v5 -; GFX9-NEXT: s_sub_u32 s8, 0, s10 +; GFX9-NEXT: v_trunc_f32_e32 v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_mul_f32_e32 v13, 0xcf800000, v12 +; GFX9-NEXT: v_add_f32_e32 v5, v13, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_subb_u32 s9, 0, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX9-NEXT: v_mul_lo_u32 v13, s9, v5 -; GFX9-NEXT: v_mul_lo_u32 v14, s8, v11 -; GFX9-NEXT: v_mul_hi_u32 v16, s8, v5 -; GFX9-NEXT: v_mul_lo_u32 v12, s8, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] +; GFX9-NEXT: s_subb_u32 s3, 0, s11 +; GFX9-NEXT: v_mul_lo_u32 v13, s3, v5 +; GFX9-NEXT: v_mul_lo_u32 v14, s2, v12 +; GFX9-NEXT: v_mul_hi_u32 v16, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v17, s2, v5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: v_add3_u32 
v4, v13, v14, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v9, v11, v12 -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v4 -; GFX9-NEXT: v_mul_hi_u32 v13, v5, v12 -; GFX9-NEXT: v_mul_hi_u32 v12, v11, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX9-NEXT: v_add_co_u32_e64 v9, s[2:3], v9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[2:3], v9, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v13, v11, v4 -; GFX9-NEXT: v_add_u32_e32 v9, v10, v9 -; GFX9-NEXT: v_mul_hi_u32 v10, v5, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v11, v4 -; GFX9-NEXT: v_add_co_u32_e64 v12, s[2:3], v13, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v10, s[2:3], v12, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[2:3], v10, v9 -; GFX9-NEXT: v_add_u32_e32 v12, v13, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v10, v12, v10, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v11, v10, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v12, s9, v5 -; GFX9-NEXT: v_mul_lo_u32 v13, s8, v9 -; GFX9-NEXT: v_mul_hi_u32 v14, s8, v5 -; GFX9-NEXT: v_mul_lo_u32 v15, s8, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v3, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_add3_u32 v3, v12, v13, v14 -; GFX9-NEXT: v_mul_lo_u32 v7, v9, v15 -; GFX9-NEXT: v_mul_lo_u32 v8, v5, v3 -; GFX9-NEXT: v_add_u32_e32 v10, v11, v10 -; GFX9-NEXT: v_mul_hi_u32 v11, v5, v15 -; GFX9-NEXT: v_mul_hi_u32 v12, v9, v15 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v11, v9, v3 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, v5, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v9, v3 -; GFX9-NEXT: 
v_add_co_u32_e32 v11, vcc, v11, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v11, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_add_u32_e32 v11, v12, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v11, v8, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v10, v3, s[2:3] -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v5, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s15, v7 -; GFX9-NEXT: v_mul_lo_u32 v9, s14, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v2, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v2, s14, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, s15, v7 +; GFX9-NEXT: v_mul_lo_u32 v9, v12, v17 +; GFX9-NEXT: v_mul_lo_u32 v13, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX9-NEXT: v_mul_hi_u32 v10, v5, v17 +; GFX9-NEXT: v_mul_hi_u32 v14, v12, v17 +; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_add_u32_e32 v9, v13, v9 +; GFX9-NEXT: v_mul_hi_u32 v13, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], v10, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], v10, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v10, v9 +; GFX9-NEXT: v_add_u32_e32 v13, v14, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] +; GFX9-NEXT: v_add3_u32 v4, v13, v10, v4 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], v12, v4, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v5 +; GFX9-NEXT: v_mul_lo_u32 v12, s2, v9 +; GFX9-NEXT: v_mul_hi_u32 v13, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v10, s2, v5 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; 
GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX9-NEXT: v_add3_u32 v8, v4, v12, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v11, v9, v10 +; GFX9-NEXT: v_mul_lo_u32 v12, v5, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v7, vcc +; GFX9-NEXT: v_mul_hi_u32 v3, v5, v10 +; GFX9-NEXT: v_mul_hi_u32 v10, v9, v10 +; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v11, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v7, v9, v8 +; GFX9-NEXT: v_add_u32_e32 v3, v11, v3 +; GFX9-NEXT: v_mul_hi_u32 v11, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v8, v9, v8 +; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v7, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v7, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v7, v3 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add3_u32 v7, v10, v7, v8 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v5, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v9, v7, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s15, v3 +; GFX9-NEXT: v_mul_lo_u32 v9, s14, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v2, vcc +; GFX9-NEXT: v_mul_hi_u32 v2, s14, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s15, v3 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, s15, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s15, v7 ; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, s14, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, 
vcc, v6, v2 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_mul_hi_u32 v8, s14, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s15, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v6, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v6, v3, v7 ; GFX9-NEXT: v_mul_lo_u32 v6, s11, v2 ; GFX9-NEXT: v_mul_lo_u32 v7, s10, v3 ; GFX9-NEXT: v_mul_hi_u32 v8, s10, v2 @@ -1590,7 +1582,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s10, v11 @@ -1610,16 +1601,13 @@ ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s10 -; GFX10-NEXT: s_sub_u32 s2, 0, s8 +; GFX10-NEXT: s_sub_u32 s6, 0, s8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 @@ -1627,14 +1615,14 @@ ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v2, v3 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: s_subb_u32 s1, 0, s9 +; GFX10-NEXT: s_subb_u32 s7, 0, s9 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_sub_u32 s3, 0, s10 +; GFX10-NEXT: 
s_sub_u32 s12, 0, s10 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: s_and_b32 s0, s0, 1 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: s_subb_u32 s6, 0, s11 +; GFX10-NEXT: s_subb_u32 s13, 0, s11 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 @@ -1647,16 +1635,16 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX10-NEXT: v_mul_lo_u32 v8, s3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s6, v2 +; GFX10-NEXT: v_mul_lo_u32 v8, s12, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX10-NEXT: v_mul_hi_u32 v6, s2, v0 -; GFX10-NEXT: v_mul_lo_u32 v9, s6, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s3, v1 -; GFX10-NEXT: v_mul_lo_u32 v7, s2, v0 -; GFX10-NEXT: v_mul_lo_u32 v11, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v5, s7, v0 +; GFX10-NEXT: v_mul_hi_u32 v6, s6, v0 +; GFX10-NEXT: v_mul_lo_u32 v9, s13, v1 +; GFX10-NEXT: v_mul_hi_u32 v10, s12, v1 +; GFX10-NEXT: v_mul_lo_u32 v7, s6, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s12, v1 ; GFX10-NEXT: v_add3_u32 v4, v5, v4, v6 ; GFX10-NEXT: v_add3_u32 v8, v9, v8, v10 ; GFX10-NEXT: v_mul_lo_u32 v5, v2, v7 @@ -1699,191 +1687,192 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 ; GFX10-NEXT: v_add3_u32 v4, v7, v6, v4 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: v_add3_u32 v5, v11, v10, v8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v2, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v9 -; GFX10-NEXT: v_mul_lo_u32 v8, s1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s1, v3, v5, s0 -; GFX10-NEXT: v_mul_hi_u32 v9, s2, v0 -; GFX10-NEXT: v_mul_lo_u32 v11, s2, v6 -; GFX10-NEXT: v_mul_lo_u32 v13, s6, v1 -; GFX10-NEXT: v_mul_hi_u32 v14, s3, v1 -; GFX10-NEXT: v_mul_lo_u32 v15, s3, v10 -; GFX10-NEXT: v_mul_lo_u32 v7, s2, v0 -; 
GFX10-NEXT: v_mul_lo_u32 v12, s3, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v5 -; GFX10-NEXT: v_add3_u32 v8, v8, v11, v9 -; GFX10-NEXT: v_add3_u32 v13, v13, v15, v14 -; GFX10-NEXT: v_mul_lo_u32 v16, v6, v7 -; GFX10-NEXT: v_mul_lo_u32 v14, v0, v8 -; GFX10-NEXT: v_mul_hi_u32 v17, v0, v7 -; GFX10-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX10-NEXT: v_mul_lo_u32 v15, v6, v8 -; GFX10-NEXT: v_mul_lo_u32 v9, v10, v12 -; GFX10-NEXT: v_mul_hi_u32 v18, v0, v8 -; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8 -; GFX10-NEXT: v_mul_lo_u32 v8, v1, v13 -; GFX10-NEXT: v_add_co_u32 v14, s1, v16, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v7, s1, v15, v7 -; GFX10-NEXT: v_mul_hi_u32 v11, v1, v12 -; GFX10-NEXT: v_mul_hi_u32 v12, v10, v12 -; GFX10-NEXT: v_mul_lo_u32 v19, v10, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v8, s1, v9, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v14, s1, v14, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v12, s1, v19, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v7, s1, v7, v18 -; GFX10-NEXT: v_add_nc_u32_e32 v14, v16, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v7, s1, v7, v14 -; GFX10-NEXT: v_mul_hi_u32 v20, v1, v13 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v15, v18 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v9, v8 -; GFX10-NEXT: v_add3_u32 v4, v11, v14, v6 -; GFX10-NEXT: v_add_co_u32 v12, s1, v12, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s1 -; GFX10-NEXT: v_mul_hi_u32 v6, v10, v13 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s1, v12, v8 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v9, v17, v15 -; GFX10-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, s1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v6, s7, v0 +; GFX10-NEXT: v_mul_hi_u32 v7, s6, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s6, v2 +; GFX10-NEXT: v_mul_lo_u32 v9, s13, v1 +; GFX10-NEXT: v_mul_hi_u32 v10, s12, v1 +; GFX10-NEXT: v_mul_lo_u32 v11, s12, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 +; GFX10-NEXT: v_mul_lo_u32 v8, s12, v1 +; GFX10-NEXT: v_add3_u32 v5, v6, v5, v7 +; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10 +; GFX10-NEXT: v_mul_lo_u32 v12, v2, v4 +; GFX10-NEXT: v_mul_lo_u32 v10, v0, v5 +; GFX10-NEXT: v_mul_hi_u32 v13, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX10-NEXT: v_mul_lo_u32 v11, v2, v5 +; GFX10-NEXT: v_mul_lo_u32 v6, v3, v8 +; GFX10-NEXT: v_mul_lo_u32 v15, v1, v9 +; GFX10-NEXT: v_mul_hi_u32 v7, v1, v8 +; GFX10-NEXT: v_mul_hi_u32 v8, v3, v8 +; GFX10-NEXT: v_add_co_u32 v10, s6, v12, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v11, v4 +; GFX10-NEXT: v_mul_lo_u32 v16, v3, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v15 +; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v10, s6, v10, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v8, s6, v16, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v4, v14 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v12, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v7 +; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v4, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s6 +; GFX10-NEXT: v_mul_hi_u32 v17, v1, v9 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; 
GFX10-NEXT: v_add_nc_u32_e32 v6, v15, v6 +; GFX10-NEXT: v_add3_u32 v5, v7, v10, v5 +; GFX10-NEXT: v_mul_hi_u32 v9, v3, v9 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 -; GFX10-NEXT: v_add3_u32 v5, v9, v8, v6 -; GFX10-NEXT: v_mul_lo_u32 v6, s17, v0 -; GFX10-NEXT: v_mul_lo_u32 v7, s16, v2 -; GFX10-NEXT: v_mul_hi_u32 v8, s17, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, vcc_lo, v3, v5, s0 -; GFX10-NEXT: v_mul_hi_u32 v5, s16, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s17, v2 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v8, s0, v9, v8 -; GFX10-NEXT: v_add_co_u32 v0, s1, v6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v8, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v7, v0 +; GFX10-NEXT: v_add_co_u32 v8, s6, v8, v17 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s12, v8, v6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mul_lo_u32 v6, s1, v0 +; GFX10-NEXT: v_mul_lo_u32 v8, s0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v11 +; GFX10-NEXT: v_mul_hi_u32 v11, s1, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s12 +; GFX10-NEXT: v_mul_lo_u32 v12, s1, v2 ; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 -; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1 -; GFX10-NEXT: v_mul_lo_u32 v8, s9, v0 +; GFX10-NEXT: v_add_co_u32 v6, s12, v6, v8 +; GFX10-NEXT: v_add3_u32 v5, v7, v5, v9 +; GFX10-NEXT: v_mul_hi_u32 v7, s0, v2 +; GFX10-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v0, s13, v6, v0 +; GFX10-NEXT: v_add_co_u32 v9, s12, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12 +; GFX10-NEXT: v_mul_hi_u32 v2, s1, v2 +; GFX10-NEXT: v_add_co_u32 v7, s12, v9, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s12 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_add_co_u32 v0, s12, v7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s12 +; GFX10-NEXT: v_mul_lo_u32 v6, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v8, s2, v3 +; GFX10-NEXT: v_mul_lo_u32 v9, s9, v0 +; GFX10-NEXT: v_mul_hi_u32 v11, s8, v0 ; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v4, s18, v3 -; GFX10-NEXT: v_mul_lo_u32 v5, s19, v3 -; GFX10-NEXT: v_mul_hi_u32 v9, s8, v0 +; GFX10-NEXT: v_mul_hi_u32 v7, s2, v1 ; GFX10-NEXT: v_mul_lo_u32 v13, s8, v0 -; GFX10-NEXT: v_mul_lo_u32 v11, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v12, s18, v3 -; GFX10-NEXT: v_mul_hi_u32 v3, s19, v3 -; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v4 -; GFX10-NEXT: v_add_co_u32 v1, s1, v5, v1 -; GFX10-NEXT: v_add3_u32 v5, v8, v11, v9 +; GFX10-NEXT: v_mul_hi_u32 v1, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v4, s3, v3 +; GFX10-NEXT: v_mul_lo_u32 v12, s8, v2 +; GFX10-NEXT: v_add_co_u32 v6, s12, v6, v8 +; GFX10-NEXT: v_mul_hi_u32 v5, s2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s12 +; GFX10-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX10-NEXT: v_add_co_u32 v1, s12, v4, v1 +; GFX10-NEXT: v_add3_u32 v9, v9, v12, v11 +; GFX10-NEXT: v_sub_co_u32 v11, vcc_lo, s0, v13 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, s1, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, s16, v13 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s17, v5 -; GFX10-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v5, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v8 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s0, s1, v9, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v11 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v11, v9, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v17, s0, v0, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v11, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v16, v12 -; GFX10-NEXT: v_add_co_u32 v12, s0, v1, v4 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v8, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, v11, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v9 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v12, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v15 +; GFX10-NEXT: 
v_cndmask_b32_e64 v13, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v1, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v17, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v18, s0 -; GFX10-NEXT: v_add3_u32 v3, v11, v1, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v7, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: v_mul_lo_u32 v6, s11, v12 -; GFX10-NEXT: v_mul_lo_u32 v7, s10, v3 -; GFX10-NEXT: v_mul_hi_u32 v11, s10, v12 -; GFX10-NEXT: v_sub_co_u32 v19, s0, v13, s8 -; GFX10-NEXT: v_mul_lo_u32 v16, s10, v12 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v20, s0, 0, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v18, v15, vcc_lo -; GFX10-NEXT: v_add3_u32 v6, v6, v7, v11 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v20, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v7, s1, s18, v16 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s2, s19, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, v4, s0 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s19, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s11, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s11, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v14, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, s11, v2, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s10, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s1 -; GFX10-NEXT: v_sub_co_u32 v13, s1, v7, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 
0, -1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s11, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v16, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, 0, v17, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, vcc_lo, s11, v2, s1 +; GFX10-NEXT: v_add_co_u32 v5, s0, v0, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v15 +; GFX10-NEXT: v_add3_u32 v3, v4, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v18, s10, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s0 +; GFX10-NEXT: v_mul_lo_u32 v13, s11, v6 +; GFX10-NEXT: v_mul_lo_u32 v17, s10, v3 +; GFX10-NEXT: v_add_co_u32 v1, s0, v5, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v16, s0 +; GFX10-NEXT: v_sub_co_u32 v19, s0, v14, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v5, s10, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo +; GFX10-NEXT: v_add3_u32 v13, v13, v17, v18 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v13 +; GFX10-NEXT: v_sub_co_u32 v12, s0, s2, v5 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s3, v13, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s11, v2, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v12 +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v8 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s11, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0 +; GFX10-NEXT: v_sub_co_u32 v13, s0, v12, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v19, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s2, 0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 
s1, s11, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s11, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s10, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s1 +; GFX10-NEXT: v_add_co_u32 v15, s1, v6, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s1, 0, v3, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v11, s1 +; GFX10-NEXT: v_add_co_u32 v11, s1, v15, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s1, 0, v17, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 ; GFX10-NEXT: v_sub_co_u32 v8, s1, v13, s10 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v13, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v12, v11, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v16, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v11, v15, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v17, v18, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v8, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v13, s1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] ; 
GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, <2 x i64> addrspace(1)* %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -58,38 +58,36 @@ ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v1, v8, vcc -; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v0 ; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v11, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; CHECK-NEXT: v_mul_hi_u32 v9, v6, v0 +; CHECK-NEXT: v_mul_lo_u32 v6, v6, v1 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v8 +; CHECK-NEXT: v_mul_hi_u32 v11, v0, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CHECK-NEXT: v_mul_lo_u32 v7, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v10, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v13, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v6 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v11 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; 
CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v5, v0 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 @@ -192,24 +190,24 @@ ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s3 -; CHECK-NEXT: s_sub_u32 s6, 0, s2 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 +; CHECK-NEXT: s_sub_u32 s4, 0, s2 +; CHECK-NEXT: s_cselect_b32 s5, 1, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v2 -; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: s_and_b32 s5, s5, 1 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_subb_u32 s7, 0, s3 +; CHECK-NEXT: s_cmp_lg_u32 s5, 0 +; CHECK-NEXT: s_subb_u32 s5, 0, s3 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v2 ; CHECK-NEXT: v_mac_f32_e32 v0, 
0xcf800000, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, s6, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, s7, v0 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v0 +; CHECK-NEXT: v_mul_lo_u32 v4, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, s4, v0 +; CHECK-NEXT: v_mul_lo_u32 v6, s5, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, s4, v0 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CHECK-NEXT: v_mul_lo_u32 v6, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v8, v0, v5 @@ -234,38 +232,36 @@ ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v2, v4, vcc -; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v4 -; CHECK-NEXT: v_mul_lo_u32 v4, s6, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, s7, v0 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v0 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v6 -; CHECK-NEXT: v_mul_hi_u32 v11, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v5, v5, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v9 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; 
CHECK-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, s4, v0 +; CHECK-NEXT: v_mul_lo_u32 v5, s5, v0 +; CHECK-NEXT: v_mul_hi_u32 v6, s4, v0 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v0, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v4, s1, v0 ; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s1, v0 @@ -403,38 +399,36 @@ ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v13, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 ; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v12 +; 
GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 
0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, 0, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 ; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 @@ -529,38 +523,36 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v11, vcc -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v8 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v5, vcc, 0, v5, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 @@ -672,38 +664,36 @@ ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v1, v12, vcc -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v2, v0 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v0 -; CGP-NEXT: v_mul_lo_u32 v2, v2, v13 -; 
CGP-NEXT: v_mul_lo_u32 v15, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v16, v0, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v12 -; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v3, v2 -; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v0, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v1, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; CGP-NEXT: v_mul_lo_u32 v3, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v15, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v13, v3 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v15 -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v12, v3 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v2, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v2 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 ; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 @@ -833,38 +823,36 @@ ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v3, v10, vcc -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v10 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_mul_lo_u32 v10, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v2 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v2 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v11 -; CGP-NEXT: v_mul_lo_u32 v13, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v10 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v13 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v9, v2 ; CGP-NEXT: v_mul_hi_u32 v5, v8, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v9, v2 @@ -982,13 +970,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 -; CHECK-NEXT: s_mov_b32 s7, 0x12d8fb -; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CHECK-NEXT: s_mov_b32 s4, 0xffed2705 +; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb ; CHECK-NEXT: s_bfe_i32 s5, -1, 0x10000 +; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_mov_b32_e32 v3, s4 -; CHECK-NEXT: v_mov_b32_e32 v4, s5 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_mov_b32_e32 v4, s7 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 @@ -996,10 +984,10 @@ ; CHECK-NEXT: 
v_mac_f32_e32 v2, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 ; CHECK-NEXT: v_mul_lo_u32 v8, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v9, s4, v2 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_mul_lo_u32 v8, v5, v7 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 @@ -1024,38 +1012,36 @@ ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_addc_u32_e64 v7, s[4:5], v5, v6, vcc -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, s6, v7 -; CHECK-NEXT: v_mul_lo_u32 v11, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v12, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v6 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_mul_lo_u32 v9, v2, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v8 -; CHECK-NEXT: v_mul_hi_u32 v13, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v7, v8 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v10, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v11 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: 
v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, s4, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v7 +; CHECK-NEXT: v_mul_lo_u32 v9, v5, v7 +; CHECK-NEXT: v_mul_hi_u32 v12, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2 ; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 @@ -1076,28 +1062,28 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v8, 0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, s7, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, s7, v5 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v5 ; 
CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v2, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s7, v0 +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, s7, v3 +; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, s6, v3 ; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc @@ -1114,9 +1100,9 @@ ; GISEL-LABEL: v_urem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s12 -; GISEL-NEXT: s_sub_u32 s8, 0, s12 +; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 +; GISEL-NEXT: s_sub_u32 s6, 0, s8 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; GISEL-NEXT: v_mov_b32_e32 v6, v4 @@ -1126,250 +1112,246 @@ ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s9, 0, 0 -; GISEL-NEXT: s_bfe_i32 s10, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v6, 
0x2f800000, v4 -; GISEL-NEXT: s_sub_u32 s13, 0, s12 +; GISEL-NEXT: s_subb_u32 s7, 0, 0 +; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mov_b32_e32 v5, s4 +; GISEL-NEXT: v_mov_b32_e32 v4, s5 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 +; GISEL-NEXT: s_sub_u32 s9, 0, s8 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_mul_lo_u32 v8, s13, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v14, s9, v5 -; GISEL-NEXT: v_mul_hi_u32 v15, s8, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; 
GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; GISEL-NEXT: s_subb_u32 s10, 0, 0 +; GISEL-NEXT: v_mul_lo_u32 v10, s9, v8 +; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, s10, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, s9, v6 +; GISEL-NEXT: v_mov_b32_e32 v15, s4 +; GISEL-NEXT: v_mul_lo_u32 v16, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v17, s7, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, s6, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v19, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; 
GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, s10, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, s9, v6 +; 
GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7 +; GISEL-NEXT: v_mul_hi_u32 v16, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v17, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v19, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, s13, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v15, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v18, s8, v13 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v15 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v5, v15 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v16 -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v19, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18 -; GISEL-NEXT: v_mul_lo_u32 v17, s13, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v12, v14 -; 
GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v14, v17 -; GISEL-NEXT: v_mov_b32_e32 v14, s10 -; GISEL-NEXT: v_mov_b32_e32 v17, s11 -; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s14, -1, 0x10000 -; GISEL-NEXT: v_add_i32_e64 v6, s[10:11], v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v8, s13 -; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[8:9], v18, v15 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v18, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v5, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v19, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v19, v16 -; GISEL-NEXT: v_mov_b32_e32 v19, s14 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v18 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v13, v15 -; GISEL-NEXT: 
v_addc_u32_e32 v6, vcc, v6, v10, vcc -; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v14 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v16, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v14 +; GISEL-NEXT: 
v_add_i32_e32 v11, vcc, v19, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v15, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v2, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v2, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e64 v10, 
s[4:5], v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v16, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v16 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, s12, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, s12, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v12, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, s12, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v6, s12, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v15, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 
v0, s[4:5], v0, v12 -; GISEL-NEXT: v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 +; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v13 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v1, v7, s[4:5] +; GISEL-NEXT: 
v_sub_i32_e64 v1, s[6:7], v1, v7 +; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v15, v6, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, s12, v2 +; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s12, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GISEL-NEXT: v_subrev_i32_e32 v10, vcc, s12, v0 +; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_subrev_i32_e32 v11, vcc, s8, v0 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s12, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc -; GISEL-NEXT: v_subrev_i32_e32 v12, vcc, s12, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v19, v10, vcc +; GISEL-NEXT: v_subrev_i32_e32 v13, vcc, s8, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GISEL-NEXT: v_subrev_i32_e32 v14, vcc, s12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GISEL-NEXT: v_subrev_i32_e32 v12, vcc, s8, v11 ; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; 
GISEL-NEXT: v_cndmask_b32_e64 v9, v10, v14, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v14, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_oddk_denom: @@ -1377,250 +1359,246 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s8, 0xffed2705 -; CGP-NEXT: s_mov_b32 s12, 0x12d8fb -; CGP-NEXT: s_bfe_i32 s10, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s11, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s13, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s14, -1, 0x10000 +; CGP-NEXT: s_mov_b32 s6, 0xffed2705 +; CGP-NEXT: s_mov_b32 s8, 0x12d8fb +; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s5, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s7, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s9, -1, 0x10000 ; CGP-NEXT: v_mov_b32_e32 v6, v4 +; CGP-NEXT: v_mov_b32_e32 v7, s4 +; CGP-NEXT: v_mov_b32_e32 v8, s5 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_mov_b32_e32 v9, s7 ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: 
v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s8, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s8, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v13, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v6, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v19, v5, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 +; CGP-NEXT: 
v_mul_lo_u32 v13, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v14, -1, v4 +; CGP-NEXT: v_mul_hi_u32 v15, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v16, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v17, -1, v5 +; CGP-NEXT: v_mul_hi_u32 v18, s6, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; CGP-NEXT: v_mul_lo_u32 v14, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v19, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v16 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v10, v16 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 +; CGP-NEXT: v_mul_lo_u32 v18, v5, v12 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_lo_u32 v15, v4, v11 +; CGP-NEXT: v_mul_lo_u32 v17, v6, v11 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v11 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 +; CGP-NEXT: v_mul_lo_u32 v19, v10, v12 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 
v15, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; CGP-NEXT: v_mul_hi_u32 v18, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v16 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v11, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v12, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v14, s8, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v15, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v16, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v17, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v18, s8, v13 -; CGP-NEXT: v_mul_lo_u32 v19, v13, v15 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 -; CGP-NEXT: v_mul_hi_u32 v18, v5, v15 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; CGP-NEXT: v_mul_lo_u32 v17, v5, v16 -; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v19, v17 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18 -; CGP-NEXT: v_mul_lo_u32 v17, s8, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v4, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v12, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v4, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v14, 
s[8:9], v14, v17 -; CGP-NEXT: v_mov_b32_e32 v14, s10 -; CGP-NEXT: v_mov_b32_e32 v17, s11 -; CGP-NEXT: v_add_i32_e64 v6, s[10:11], v6, v8 -; CGP-NEXT: v_mov_b32_e32 v8, s13 -; CGP-NEXT: v_add_i32_e64 v7, s[10:11], v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v15, s[8:9], v18, v15 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v12 -; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v18, v9 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v18, v12 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; CGP-NEXT: v_mul_lo_u32 v19, v13, v16 -; CGP-NEXT: v_mul_hi_u32 v13, v13, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v5, v16 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v19, v11 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v19, v16 -; CGP-NEXT: v_mov_b32_e32 v19, s14 -; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, v18 -; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v13, v15 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc -; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v2, v4 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 
1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v18, v17 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v11, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v13, -1, v4 +; CGP-NEXT: v_mul_hi_u32 v14, s6, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc +; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v15, -1, v5 +; CGP-NEXT: v_mul_hi_u32 v16, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v17, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v18, v6, v11 +; CGP-NEXT: v_mul_hi_u32 v19, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; CGP-NEXT: v_mul_lo_u32 v17, s6, v10 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; CGP-NEXT: v_mul_lo_u32 v16, v5, v15 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v13 +; CGP-NEXT: v_mul_lo_u32 v16, v6, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 +; CGP-NEXT: v_mul_lo_u32 v19, v10, v15 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: 
v_add_i32_e32 v16, vcc, v17, v16 +; CGP-NEXT: v_mul_hi_u32 v17, v5, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v19, v12 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; CGP-NEXT: v_mov_b32_e32 v19, s9 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v1, v5 +; CGP-NEXT: v_mul_hi_u32 v14, v0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_mul_lo_u32 v13, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v16, v2, v6 +; CGP-NEXT: v_mul_lo_u32 v15, v2, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v3, v6 +; CGP-NEXT: v_mul_hi_u32 v17, v2, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v18, v0, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; CGP-NEXT: v_mul_lo_u32 v18, v0, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v11, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, 
v7 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v15, v4 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v0, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v1, v10 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v16, v4 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_lo_u32 v10, s12, v4 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v4 -; CGP-NEXT: v_mul_hi_u32 v4, s12, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, s12, v5 -; CGP-NEXT: v_mul_lo_u32 v15, 0, v5 -; CGP-NEXT: v_mul_hi_u32 v5, s12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v6, s12, v6 -; CGP-NEXT: v_mul_lo_u32 v7, s12, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 
+; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v17 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v18, v17 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v13, s8, v4 +; CGP-NEXT: v_mul_lo_u32 v15, 0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, s8, v4 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_mul_lo_u32 v14, s8, v5 +; CGP-NEXT: v_mul_lo_u32 v16, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, s8, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v6, s8, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s8, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v16, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 ; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v2 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 -; CGP-NEXT: v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5] +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; CGP-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v5, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v5 -; CGP-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v0 +; CGP-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 
-1, s[6:7] ; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; CGP-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, s12, v2 +; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s12, v8 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; CGP-NEXT: v_subrev_i32_e32 v10, vcc, s12, v0 +; CGP-NEXT: v_subrev_i32_e32 v11, vcc, s8, v0 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc -; CGP-NEXT: v_subrev_i32_e32 v12, vcc, s12, v8 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_subrev_i32_e32 v13, vcc, s8, v7 +; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; CGP-NEXT: v_subrev_i32_e32 v14, vcc, s12, v10 +; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc +; CGP-NEXT: v_subrev_i32_e32 v12, vcc, s8, v11 ; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, v10, v14, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, v11, v12, s[4:5] +; CGP-NEXT: 
v_cndmask_b32_e32 v3, v3, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[4:5] ; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, v10, v1, s[4:5] ; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, @@ -1683,38 +1661,36 @@ ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v1, v8, vcc -; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v0 ; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v0 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v11, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 -; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 -; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v10 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v8 +; CHECK-NEXT: v_mul_hi_u32 v11, v0, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v13, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v9, v2 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] 
-; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v11 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v8 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 ; CHECK-NEXT: v_mul_hi_u32 v7, v3, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 @@ -1849,38 +1825,36 @@ ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v13, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v6 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 ; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 ; GISEL-NEXT: 
v_mul_hi_u32 v10, v10, v6 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, 
v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, 0, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 ; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 @@ -1975,38 +1949,36 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v11, vcc -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] 
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v8 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v7, vcc, 0, v7, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 ; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 @@ -2121,38 +2093,36 @@ ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v1, v12, vcc -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v4, v0 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v0 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v13 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v12 -; CGP-NEXT: 
v_mul_hi_u32 v16, v0, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v12 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v6, v4 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v14 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v0 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v0, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v1, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; CGP-NEXT: v_mul_lo_u32 v6, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v16 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v15 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v4 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 
v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 ; CGP-NEXT: v_mul_hi_u32 v6, v8, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 @@ -2282,38 +2252,36 @@ ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v3, v8, vcc -; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v2 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v8 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v9, v8 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v6, v4 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v9, v4 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v9, v6 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v13 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v9, 
v8 -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v7, v2 ; CGP-NEXT: v_mul_hi_u32 v6, v5, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v7, v2 @@ -2529,190 +2497,186 @@ ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 -; GISEL-NEXT: s_bfe_i32 s10, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s8, -1, 0x10000 ; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 ; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mov_b32_e32 v16, s4 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 
0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_mov_b32_e32 v19, s5 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mov_b32_e32 v15, s7 +; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v4, v6 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 -; GISEL-NEXT: v_addc_u32_e64 v16, s[6:7], v11, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v19, v16, v17 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v17 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v18, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v15 -; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v15 -; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v4, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 
v5, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 ; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v18, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v10, v5 -; GISEL-NEXT: v_mov_b32_e32 v5, s10 -; GISEL-NEXT: v_mov_b32_e32 v10, s11 -; GISEL-NEXT: v_add_i32_e64 v8, s[10:11], v8, v12 -; GISEL-NEXT: v_mov_b32_e32 v12, s12 -; GISEL-NEXT: v_add_i32_e64 v11, s[10:11], v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 -; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v16, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v16, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v19, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v15, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v19, v15 -; GISEL-NEXT: 
v_mov_b32_e32 v19, s13 -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v17 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v14, v13 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v16, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v14, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v9 +; GISEL-NEXT: v_add_i32_e32 
v13, vcc, v18, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v11, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; GISEL-NEXT: v_mov_b32_e32 v18, s8 +; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; 
GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, 
v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v8 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v4, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v4, vcc ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v6, s[6:7], 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 +; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, 
vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v6, vcc, 0, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v15, v9, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v2, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v19, v14, vcc -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1 +; 
GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -9249,8 +9249,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s2, 0xfee0 -; GFX6-NEXT: s_mov_b32 s3, 0x68958c89 +; GFX6-NEXT: s_movk_i32 s4, 0xfee0 +; GFX6-NEXT: s_mov_b32 s5, 0x68958c89 ; GFX6-NEXT: v_mov_b32_e32 v8, 0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -9259,15 +9259,14 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mov_b32_e32 v7, 0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 +; GFX6-NEXT: s_movk_i32 s8, 0x11f +; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 @@ -9282,70 +9281,69 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 -; GFX6-NEXT: 
s_movk_i32 s2, 0x11f -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX6-NEXT: s_mov_b32 s3, 0x976a7377 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX6-NEXT: s_mov_b32 s4, 0x976a7376 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX6-NEXT: 
v_mov_b32_e32 v5, s2 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v5, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s3, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s9, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: s_movk_i32 s3, 0x11e -; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: s_movk_i32 s2, 0x11e +; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v4 +; GFX6-NEXT: s_mov_b32 s9, 0x976a7376 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v5 +; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s9, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -9353,19 +9351,19 @@ ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s7 +; GFX6-NEXT: v_mov_b32_e32 v6, s3 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, s8, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i64_oddk_denom: @@ -9374,8 +9372,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xfee0 -; GFX9-NEXT: s_mov_b32 s5, 0x68958c89 +; GFX9-NEXT: s_movk_i32 s2, 0xfee0 +; GFX9-NEXT: s_mov_b32 s3, 0x68958c89 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -9384,10 +9382,11 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s4 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, s5 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, s3 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 @@ -9404,33 +9403,32 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s5 -; GFX9-NEXT: v_mul_lo_u32 v7, v2, s5 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s5 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: 
v_add_u32_e32 v4, v4, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v9 -; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v10, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v2, v9 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v9 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, s3 +; GFX9-NEXT: s_movk_i32 s2, 0x11f +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 +; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; 
GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 @@ -9441,8 +9439,6 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: s_movk_i32 s2, 0x11f -; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc @@ -9493,22 +9489,22 @@ ; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 -; GFX90A-NEXT: s_mov_b32 s3, 0x68958c89 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b32 s0, 0x68958c89 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 -; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s0 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s0 ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s3 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 @@ -9523,32 +9519,30 @@ ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 -; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s3 -; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s3 -; GFX90A-NEXT: 
v_add_u32_e32 v6, v7, v6 -; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s3 -; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 -; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc -; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 -; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 -; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v2, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, s0 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 +; GFX90A-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, 
v4, vcc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 @@ -9783,7 +9777,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s2, 0xf001 +; GFX6-NEXT: s_movk_i32 s6, 0xf001 ; GFX6-NEXT: v_mov_b32_e32 v8, 0 ; GFX6-NEXT: v_mov_b32_e32 v7, 0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9793,12 +9787,13 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s2 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], 12 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 @@ -9815,43 +9810,40 @@ ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX6-NEXT: v_mul_hi_u32 v4, v0, s2 -; GFX6-NEXT: v_mul_lo_u32 v5, v2, s2 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, s2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v6 -; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; 
GFX6-NEXT: v_mul_lo_u32 v10, v2, v6 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s11, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 +; GFX6-NEXT: s_movk_i32 s0, 0xfff +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_movk_i32 s0, 0xfff +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX6-NEXT: 
v_mul_hi_u32 v5, s3, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc @@ -9865,8 +9857,8 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s10, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 ; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc @@ -9885,8 +9877,8 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9895,7 +9887,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xf001 +; GFX9-NEXT: s_movk_i32 s2, 0xf001 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9904,10 +9896,12 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s8, 0xfff -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s4 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s4 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s2 +; GFX9-NEXT: v_mul_lo_u32 
v3, v0, s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 @@ -9924,34 +9918,32 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_hi_u32 v4, v0, s4 -; GFX9-NEXT: v_mul_lo_u32 v6, v2, s4 -; GFX9-NEXT: v_mul_lo_u32 v8, v0, s4 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v8 -; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v10, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v8 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, 
v2 +; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX9-NEXT: s_movk_i32 s4, 0xffe +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 @@ -9961,8 +9953,6 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 -; GFX9-NEXT: s_movk_i32 s4, 0xffe ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc @@ -10002,7 +9992,8 @@ ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0xf001 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -10011,13 +10002,14 @@ ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s8 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 +; GFX90A-NEXT: s_movk_i32 s4, 0xf001 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, 
s4 ; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s8 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 ; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s8 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 @@ -10032,32 +10024,29 @@ ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 -; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s8 -; GFX90A-NEXT: v_mul_lo_u32 v5, v2, s8 -; GFX90A-NEXT: v_sub_u32_e32 v6, v6, v0 -; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s8 -; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 -; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc -; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v9 -; GFX90A-NEXT: v_mul_lo_u32 v9, v2, v9 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 -; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 +; 
GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 ; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 @@ -10072,18 +10061,18 @@ ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0xfff -; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s0 -; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s0 +; GFX90A-NEXT: s_movk_i32 s4, 0xfff +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 ; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s0 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s4 ; GFX90A-NEXT: v_mov_b32_e32 v5, s7 ; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 ; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc -; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s0, v3 +; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v3 ; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0xffe -; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 +; GFX90A-NEXT: s_movk_i32 s4, 0xffe +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc @@ 
-10091,16 +10080,15 @@ ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc -; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v5, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm %r = udiv <2 x i64> %x, @@ -10227,56 +10215,54 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: s_movk_i32 s4, 0x11f -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, 
v4 ; GFX6-NEXT: s_mov_b32 s9, s5 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX6-NEXT: s_movk_i32 s5, 0x11e -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: s_movk_i32 s5, 0x11e +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 +; 
GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 @@ -10318,8 +10304,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xfee0 -; GFX9-NEXT: s_mov_b32 s5, 0x689e0837 +; GFX9-NEXT: s_movk_i32 s2, 0xfee0 +; GFX9-NEXT: s_mov_b32 s3, 0x689e0837 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -10328,12 +10314,12 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, s3 ; GFX9-NEXT: s_movk_i32 s8, 0x11f -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s4 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, s5 -; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 @@ -10345,39 +10331,37 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 ; 
GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 +; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s5 -; GFX9-NEXT: v_mul_lo_u32 v7, v2, s5 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s5 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v9 -; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v10, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v2, v9 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v9 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, s3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 
+; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 @@ -10439,22 +10423,22 @@ ; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 -; GFX90A-NEXT: s_mov_b32 s3, 0x689e0837 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b32 s0, 0x689e0837 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 -; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s0 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s0 ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s3 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 @@ -10469,32 +10453,30 @@ ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v4, vcc, v8, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 -; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s3 -; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s3 -; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s3 -; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 -; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc -; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 -; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 -; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v2, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, s0 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 +; GFX90A-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc +; 
GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 @@ -10824,7 +10806,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s2, 0xffed2705 +; GFX6-NEXT: s_mov_b32 s5, 0xffed2705 ; GFX6-NEXT: v_mov_b32_e32 v8, 0 ; GFX6-NEXT: v_mov_b32_e32 v7, 0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -10833,14 +10815,14 @@ ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 @@ -10857,69 +10839,66 @@ ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 -; GFX6-NEXT: s_mov_b32 s5, s9 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 
v5, v0, s2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: s_ashr_i32 s2, s11, 31 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s10, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s1, s11, s2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 +; GFX6-NEXT: s_mov_b32 s9, s8 +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, 
vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: s_mov_b32 s3, 0x12d8fb +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s3, v8 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 ; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa ; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc ; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 @@ -10931,10 +10910,11 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -10944,7 +10924,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s8, 0xffed2705 +; GFX9-NEXT: s_mov_b32 s4, 0xffed2705 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -10953,13 +10933,10 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s7, 31 -; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -10976,34 +10953,35 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, 
vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 -; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 -; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] -; GFX9-NEXT: s_add_u32 s2, s6, s0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_addc_u32 s3, s7, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v10, vcc +; GFX9-NEXT: 
v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 @@ -11013,23 +10991,23 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX9-NEXT: s_mov_b32 s1, 0x12d8fb +; GFX9-NEXT: s_mov_b32 s5, 0x12d8fb ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s1 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s1 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s1 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s1, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v4 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc -; GFX9-NEXT: s_mov_b32 s1, 0x12d8fa -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 +; GFX9-NEXT: s_mov_b32 s2, 0x12d8fa +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc @@ -11037,19 +11015,19 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc ; 
GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: sdiv_i64_oddk_denom: @@ -11057,7 +11035,7 @@ ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 -; GFX90A-NEXT: s_mov_b32 s2, 0xffed2705 +; GFX90A-NEXT: s_mov_b32 s4, 0xffed2705 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -11066,12 +11044,12 @@ ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 -; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, 
v0, v6 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 @@ -11086,37 +11064,35 @@ ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s2 -; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s2 -; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 -; GFX90A-NEXT: v_mul_lo_u32 v7, v0, s2 -; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v7 -; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v7 -; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v7 -; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, v8, v11, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 -; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 +; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v8, v9, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 +; GFX90A-NEXT: 
v_addc_co_u32_e32 v5, vcc, v9, v6, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 -; GFX90A-NEXT: s_add_u32 s2, s6, s0 +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: s_add_u32 s2, s2, s4 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX90A-NEXT: s_mov_b32 s1, s0 -; GFX90A-NEXT: s_addc_u32 s3, s7, s0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_addc_u32 s3, s3, s4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 ; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 @@ -11131,18 +11107,18 @@ ; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc -; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fb -; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s1 +; GFX90A-NEXT: s_mov_b32 s5, 0x12d8fb +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s5 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s1 +; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s5 ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v4 +; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v4 ; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc -; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fa -; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v5 +; GFX90A-NEXT: s_mov_b32 s2, 0x12d8fa +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v5 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc @@ -11150,19 +11126,19 @@ ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v4 +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v4 ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, -1, v4, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s0 -; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s4 +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 ; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_endpgm %r = sdiv i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -11236,35 +11212,38 @@ ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 -; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_add_u32 s4, s4, s2 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s5, s5, s2 -; GFX6-NEXT: s_xor_b64 s[12:13], s[4:5], s[2:3] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_u32 s4, 0, s12 -; GFX6-NEXT: s_subb_u32 s5, 0, s13 -; GFX6-NEXT: s_ashr_i32 s14, s11, 31 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; 
GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 +; GFX6-NEXT: s_mov_b32 s9, s8 +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX6-NEXT: s_sub_u32 s4, 0, s10 +; GFX6-NEXT: s_subb_u32 s5, 0, s11 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s15, s14 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_ashr_i32 s12, s3, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_add_u32 s2, s2, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 +; GFX6-NEXT: s_addc_u32 s3, s3, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -11276,6 +11255,7 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0 @@ -11283,68 +11263,63 @@ ; GFX6-NEXT: v_mov_b32_e32 v6, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, s5, v0 -; GFX6-NEXT: s_mov_b32 s5, s9 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v7, s4, v0 -; 
GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s10, s14 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s11, s14 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v7, s11, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_mul_lo_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v7, s3, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 
; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -11352,18 +11327,18 @@ ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s11 +; GFX6-NEXT: v_mov_b32_e32 v6, s3 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[2:3] +; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -11379,16 +11354,16 @@ ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 -; GFX9-NEXT: s_ashr_i32 s8, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s3, s3, s8 -; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_u32 s12, 0, s10 -; GFX9-NEXT: s_subb_u32 s4, 0, s11 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: 
s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s5, s5, s2 +; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_sub_u32 s10, 0, s8 +; GFX9-NEXT: s_subb_u32 s4, 0, s9 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -11397,10 +11372,10 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s12, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 @@ -11418,39 +11393,37 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v5, s12, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, s12, v0 -; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v9, s12, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v7, s10, v0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 -; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 -; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; 
GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v9, v0, v7 +; GFX9-NEXT: v_mul_hi_u32 v10, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc +; GFX9-NEXT: s_add_u32 s0, s6, s10 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 @@ -11465,39 +11438,39 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc -; GFX9-NEXT: 
v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s11, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s9, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 ; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 ; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v4 +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v4 ; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] +; 
GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -11521,22 +11494,25 @@ ; GFX90A-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX90A-NEXT: s_sub_u32 s10, 0, s8 -; GFX90A-NEXT: s_subb_u32 s11, 0, s9 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_sub_u32 s0, 0, s8 +; GFX90A-NEXT: s_subb_u32 s1, 0, s9 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 +; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX90A-NEXT: v_mul_hi_u32 v5, s10, v0 -; GFX90A-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 -; GFX90A-NEXT: v_mul_lo_u32 v6, s10, v0 +; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 @@ -11553,37 +11529,32 @@ ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v5, s10, v3 -; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v0 -; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX90A-NEXT: v_mul_lo_u32 v7, s11, v0 -; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 -; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v0 -; GFX90A-NEXT: 
v_mul_hi_u32 v9, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 -; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v2, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc ; GFX90A-NEXT: s_add_u32 s0, s6, s10 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; GFX90A-NEXT: v_addc_co_u32_e32 
v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 @@ -11750,7 +11721,7 @@ ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s6, 0xf001 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -11759,13 +11730,13 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s0, s9, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 +; GFX6-NEXT: s_ashr_i32 s8, s1, 31 +; GFX6-NEXT: s_lshr_b32 s8, s8, 20 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 -; GFX6-NEXT: s_add_u32 s2, s8, s0 -; GFX6-NEXT: s_addc_u32 s3, s9, 0 -; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX6-NEXT: s_add_u32 s0, s0, s8 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_ashr_i64 s[8:9], s[0:1], 12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 @@ -11778,8 +11749,8 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: s_ashr_i32 s8, s11, 31 -; GFX6-NEXT: s_mov_b32 s9, s8 +; GFX6-NEXT: s_ashr_i32 s10, s3, 31 +; GFX6-NEXT: s_add_u32 s0, s2, s10 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0 @@ -11787,34 +11758,32 @@ ; GFX6-NEXT: v_mov_b32_e32 v6, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v5, v2, s6 -; GFX6-NEXT: 
v_mul_hi_u32 v7, v0, s6 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, s6 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 -; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s10, s8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s11, s8 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s6 +; GFX6-NEXT: s_mov_b32 s11, s10 +; GFX6-NEXT: s_addc_u32 s1, s3, s10 +; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[10:11] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; 
GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 @@ -11824,17 +11793,17 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: s_movk_i32 s9, 0xfff +; GFX6-NEXT: s_movk_i32 s2, 0xfff ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s9 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, v0, s9 +; GFX6-NEXT: v_mul_lo_u32 v8, v0, s2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc @@ -11842,7 +11811,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v5, s1 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v8 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s2, v8 ; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc ; GFX6-NEXT: s_movk_i32 s0, 0xffe ; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 @@ -11859,13 +11828,13 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: v_subrev_i32_e32 v2, 
vcc, s10, v0 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11876,8 +11845,8 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s8, 0xf001 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -11890,8 +11859,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 -; GFX9-NEXT: s_add_u32 s4, s4, s2 -; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 @@ -11903,44 +11872,41 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: s_ashr_i32 s4, s7, 31 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v5, v2, s8 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_sub_u32_e32 v5, 
v5, v0 -; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 -; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 -; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v5 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s6, s6, s2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s7, s7, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 +; GFX9-NEXT: s_add_u32 s6, s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: s_addc_u32 s7, s7, s4 +; 
GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 @@ -11950,23 +11916,24 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: s_movk_i32 s3, 0xfff +; GFX9-NEXT: s_movk_i32 s5, 0xfff +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s3 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s3, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v5 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc -; GFX9-NEXT: s_movk_i32 s3, 0xffe -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 +; GFX9-NEXT: s_movk_i32 s5, 0xffe +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc @@ -11974,20 +11941,20 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v5 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s5, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s4, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm @@ -11998,7 +11965,6 @@ ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0xf001 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 @@ -12011,11 +11977,13 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 ; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 -; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s8 -; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s8 +; GFX90A-NEXT: s_add_u32 s0, s4, s0 +; GFX90A-NEXT: s_movk_i32 s4, 0xf001 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s4 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 ; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s8 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 @@ -12026,44 +11994,41 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX90A-NEXT: s_add_u32 s0, s4, s0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX90A-NEXT: 
s_addc_u32 s1, s5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX90A-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 -; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v5, v2, s8 -; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s8 -; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 -; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s8 -; GFX90A-NEXT: v_mul_hi_u32 v9, v2, v8 -; GFX90A-NEXT: v_mul_lo_u32 v10, v2, v8 -; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 -; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] -; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 -; GFX90A-NEXT: s_add_u32 s6, s6, s0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-NEXT: s_mov_b32 s1, s0 -; GFX90A-NEXT: s_addc_u32 s7, s7, s0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX90A-NEXT: 
v_mul_hi_u32 v9, v0, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc +; GFX90A-NEXT: s_addc_u32 s1, s5, 0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX90A-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc +; GFX90A-NEXT: s_add_u32 s6, s6, s4 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_addc_u32 s7, s7, s4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 ; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 @@ -12078,18 +12043,18 @@ ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc -; GFX90A-NEXT: s_movk_i32 s1, 0xfff -; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s1 -; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 +; GFX90A-NEXT: s_movk_i32 s5, 0xfff +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s5 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s1 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX90A-NEXT: v_mov_b32_e32 v5, s7 ; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 ; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc -; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v3 +; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v3 ; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc -; GFX90A-NEXT: s_movk_i32 s1, 0xffe -; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v5 +; GFX90A-NEXT: s_movk_i32 s5, 0xffe +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, 
vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc @@ -12097,20 +12062,20 @@ ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s0 -; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 +; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s4 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s4, v0 ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm %r = sdiv <2 x i64> %x, @@ -12140,42 +12105,49 @@ ; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc ; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 -; GFX6-NEXT: s_ashr_i32 s16, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s16 -; GFX6-NEXT: s_mov_b32 s17, s16 -; GFX6-NEXT: s_addc_u32 s3, s3, s16 -; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX6-NEXT: s_ashr_i32 s12, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s3, s3, s12 +; GFX6-NEXT: s_xor_b64 
s[10:11], s[2:3], s[12:13] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 -; GFX6-NEXT: s_sub_u32 s6, 0, s14 -; GFX6-NEXT: s_subb_u32 s7, 0, s15 +; GFX6-NEXT: s_sub_u32 s6, 0, s10 +; GFX6-NEXT: s_subb_u32 s7, 0, s11 ; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_ashr_i32 s14, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s14 ; GFX6-NEXT: v_mul_lo_u32 v0, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v3 ; GFX6-NEXT: v_mul_lo_u32 v4, s7, v3 ; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 +; GFX6-NEXT: s_mov_b32 s15, s14 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v0, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v1 +; GFX6-NEXT: s_addc_u32 s1, s1, s14 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 +; GFX6-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] +; GFX6-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v2, v1 @@ -12184,92 +12156,82 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e64 v3, s[2:3], v3, v4 -; GFX6-NEXT: v_addc_u32_e64 v4, vcc, v2, v5, s[2:3] -; GFX6-NEXT: v_mul_lo_u32 v6, s6, v4 -; GFX6-NEXT: 
v_mul_hi_u32 v7, s6, v3 -; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX6-NEXT: v_mul_lo_u32 v7, s6, v3 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GFX6-NEXT: v_mul_lo_u32 v10, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v7 -; GFX6-NEXT: v_mul_hi_u32 v12, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v4, v7 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v4, v6 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v2, v6, s[2:3] -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s2, s9, 31 -; GFX6-NEXT: s_add_u32 s0, s8, s2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s1, s9, s2 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] -; GFX6-NEXT: v_mul_lo_u32 v4, s8, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, s8, v3 -; GFX6-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s9, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, s9, v2 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s6, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, s6, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, s7, v3 +; GFX6-NEXT: s_ashr_i32 s12, s9, 31 +; GFX6-NEXT: s_add_u32 s8, s8, s12 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX6-NEXT: 
v_mul_hi_u32 v6, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s16, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, s16, v3 +; GFX6-NEXT: v_mul_hi_u32 v6, s16, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s17, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s17, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s9, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[16:17] -; GFX6-NEXT: s_ashr_i32 s2, s13, 31 +; GFX6-NEXT: v_mul_lo_u32 v6, s17, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, s17, v3 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s9, s9, s12 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v0, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s14, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s14, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s15, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s15 -; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s11, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s11 +; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s14, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s10, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s9, v4 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s8, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s17, 
v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s16, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s10, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX6-NEXT: s_add_u32 s8, s12, s2 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v8, s9 -; GFX6-NEXT: s_addc_u32 s9, s13, s2 -; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] +; GFX6-NEXT: v_mov_b32_e32 v8, s17 ; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v11, s9 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 ; GFX6-NEXT: v_mac_f32_e32 v10, s18, v11 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_rcp_f32_e32 v5, v10 @@ -12282,15 +12244,15 @@ ; GFX6-NEXT: v_mac_f32_e32 v5, s21, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX6-NEXT: s_sub_u32 s12, 0, s8 +; GFX6-NEXT: s_sub_u32 s0, 0, s8 ; 
GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v5 -; GFX6-NEXT: v_mul_lo_u32 v7, s12, v6 -; GFX6-NEXT: s_subb_u32 s13, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v8, s13, v5 -; GFX6-NEXT: v_xor_b32_e32 v2, s16, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s0, v5 +; GFX6-NEXT: v_mul_lo_u32 v7, s0, v6 +; GFX6-NEXT: s_subb_u32 s1, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v8, s1, v5 +; GFX6-NEXT: s_ashr_i32 s10, s3, 31 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, s12, v5 +; GFX6-NEXT: v_mul_lo_u32 v7, s0, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_mul_lo_u32 v8, v5, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 @@ -12301,54 +12263,53 @@ ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GFX6-NEXT: v_mul_lo_u32 v10, v6, v7 ; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX6-NEXT: v_xor_b32_e32 v3, s17, v3 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s11, s10 +; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc -; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], v5, v4 -; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v6, v7, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v8, s12, v5 -; GFX6-NEXT: v_mul_hi_u32 v9, s12, v4 -; GFX6-NEXT: v_mul_lo_u32 v10, s13, v4 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GFX6-NEXT: v_mul_lo_u32 v9, s12, v4 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GFX6-NEXT: v_mul_lo_u32 v12, v4, v8 -; GFX6-NEXT: v_mul_hi_u32 v13, v4, v9 -; GFX6-NEXT: v_mul_hi_u32 v14, v4, v8 -; GFX6-NEXT: v_mul_hi_u32 v11, v5, v9 -; GFX6-NEXT: v_mul_lo_u32 v9, v5, v9 -; GFX6-NEXT: v_mul_hi_u32 v10, v5, v8 -; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v5, v8 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, 
vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GFX6-NEXT: s_ashr_i32 s12, s11, 31 -; GFX6-NEXT: v_addc_u32_e64 v6, vcc, v6, v8, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s10, s12 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s1, s11, s12 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v6, s10, v5 -; GFX6-NEXT: v_mul_hi_u32 v7, s10, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, s10, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, s11, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s11, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 +; GFX6-NEXT: v_mul_hi_u32 v7, s0, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, s1, v4 +; GFX6-NEXT: v_xor_b32_e32 v3, s15, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_mul_lo_u32 v7, s0, v4 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GFX6-NEXT: v_mul_lo_u32 v10, v4, v6 +; GFX6-NEXT: v_mul_hi_u32 v11, v4, v7 +; GFX6-NEXT: v_mul_hi_u32 v12, v4, v6 +; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 +; GFX6-NEXT: v_mul_lo_u32 v7, v5, v7 +; GFX6-NEXT: v_mul_hi_u32 v8, v5, v6 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v5, v6 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc +; GFX6-NEXT: s_add_u32 s0, s2, s10 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: s_addc_u32 s1, s3, s10 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; GFX6-NEXT: s_xor_b64 s[2:3], s[0:1], s[10:11] +; GFX6-NEXT: v_mul_lo_u32 v6, s2, 
v5 +; GFX6-NEXT: v_mul_hi_u32 v7, s2, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, s2, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, s3, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s3, v5 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, s11, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, s11, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, s17 +; GFX6-NEXT: v_mul_lo_u32 v9, s3, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, s3, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, s15 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc @@ -12356,15 +12317,15 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, s8, v5 ; GFX6-NEXT: v_mul_hi_u32 v7, s8, v4 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s16, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, s9, v4 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s8, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s3, v2 ; GFX6-NEXT: v_mov_b32_e32 v7, s9 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc ; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s8, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] @@ -12380,7 +12341,7 @@ ; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v8, s11 +; GFX6-NEXT: v_mov_b32_e32 v8, s3 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc @@ -12391,7 +12352,7 @@ ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 
v3, v4, v3, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] +; GFX6-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX6-NEXT: v_xor_b32_e32 v4, s1, v2 @@ -12419,20 +12380,24 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 -; GFX9-NEXT: s_sub_u32 s14, 0, s10 -; GFX9-NEXT: s_subb_u32 s4, 0, s11 +; GFX9-NEXT: s_sub_u32 s2, 0, s10 +; GFX9-NEXT: s_subb_u32 s3, 0, s11 ; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s14, v2 -; GFX9-NEXT: v_mul_hi_u32 v1, s14, v3 -; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s14, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s14, s5, 31 +; GFX9-NEXT: s_mov_b32 s15, s14 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 +; GFX9-NEXT: v_mul_hi_u32 v1, s2, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v1, v3, v4 @@ -12451,38 +12416,32 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc -; GFX9-NEXT: v_add_co_u32_e64 v3, s[2:3], v3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, s14, v4 -; GFX9-NEXT: v_mul_hi_u32 v7, s14, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 -; GFX9-NEXT: v_mul_lo_u32 v9, s14, v3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: v_mul_lo_u32 v10, v3, v6 -; 
GFX9-NEXT: v_mul_hi_u32 v11, v3, v9 -; GFX9-NEXT: v_mul_hi_u32 v12, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v4, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v4, v9 -; GFX9-NEXT: v_mul_hi_u32 v7, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s14, s5, 31 -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, s2, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 ; GFX9-NEXT: s_add_u32 s2, s4, s14 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v3, v7 +; GFX9-NEXT: v_mul_hi_u32 v10, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v6, v2, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v2, v7 +; GFX9-NEXT: v_mul_hi_u32 v5, v2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: s_mov_b32 s15, s14 ; GFX9-NEXT: s_addc_u32 s3, s5, s14 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], 
s[14:15] ; GFX9-NEXT: v_mul_lo_u32 v4, s4, v2 ; GFX9-NEXT: v_mul_hi_u32 v5, s4, v3 @@ -12494,7 +12453,6 @@ ; GFX9-NEXT: v_mul_lo_u32 v6, s5, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v0, vcc @@ -12522,12 +12480,13 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v2, v6 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_u32 s0, s8, s4 +; GFX9-NEXT: s_add_u32 s8, s8, s4 ; GFX9-NEXT: v_mov_b32_e32 v8, s5 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s1, s9, s4 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] +; GFX9-NEXT: s_addc_u32 s9, s9, s4 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v8, v4, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s9 @@ -12565,44 +12524,42 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc ; GFX9-NEXT: v_mul_lo_u32 v10, v5, v7 ; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, s13, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v0, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v8, vcc -; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v4, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, s10, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, s10, v4 -; GFX9-NEXT: v_mul_lo_u32 v10, s11, v4 -; GFX9-NEXT: v_mul_lo_u32 v11, s10, v4 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 -; 
GFX9-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v13, v4, v11 -; GFX9-NEXT: v_mul_hi_u32 v14, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v10, v6, v11 -; GFX9-NEXT: v_mul_lo_u32 v11, v6, v11 -; GFX9-NEXT: v_mul_hi_u32 v9, v6, v8 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v6, v8 -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s10, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, s10, v4 +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v4 ; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s6, s10 +; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: v_mul_lo_u32 v10, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v11, v4, v9 +; GFX9-NEXT: v_mul_hi_u32 v12, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v5, v9 +; GFX9-NEXT: v_mul_lo_u32 v9, v5, v9 +; GFX9-NEXT: v_mul_hi_u32 v7, v5, v6 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v7, vcc +; GFX9-NEXT: s_add_u32 s6, s6, s10 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, 
v5, vcc -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: s_addc_u32 s7, s7, s10 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v6, s6, v5 ; GFX9-NEXT: v_mul_hi_u32 v7, s6, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, s6, v5 @@ -12612,7 +12569,7 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc ; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, s7, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v0, vcc @@ -12621,7 +12578,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v6, s8, v5 ; GFX9-NEXT: v_mul_hi_u32 v7, s8, v4 ; GFX9-NEXT: v_mul_lo_u32 v9, s9, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s12, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v7, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, s8, v4 @@ -12678,216 +12635,212 @@ ; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s3, s3, s10 ; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s12 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s13 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 -; GFX90A-NEXT: s_sub_u32 s14, 0, s12 -; GFX90A-NEXT: s_subb_u32 s15, 0, s13 -; GFX90A-NEXT: v_mac_f32_e32 v1, s16, v2 -; GFX90A-NEXT: v_rcp_f32_e32 v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: v_mul_f32_e32 v1, s17, v1 -; GFX90A-NEXT: v_mul_f32_e32 v2, s18, v1 -; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 -; GFX90A-NEXT: v_mac_f32_e32 v1, s19, v2 +; GFX90A-NEXT: s_sub_u32 s0, 0, s12 +; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: 
s_subb_u32 s1, 0, s13 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 +; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX90A-NEXT: v_mul_hi_u32 v4, s14, v1 -; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v2 -; GFX90A-NEXT: v_mul_lo_u32 v3, s15, v1 -; GFX90A-NEXT: v_add_u32_e32 v4, v4, v5 -; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX90A-NEXT: v_mul_lo_u32 v6, s14, v1 -; GFX90A-NEXT: v_mul_lo_u32 v5, v1, v3 -; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v6 -; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 +; GFX90A-NEXT: s_mov_b32 s15, s14 +; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX90A-NEXT: v_mul_hi_u32 v8, v2, v6 -; GFX90A-NEXT: v_mul_lo_u32 v6, v2, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_add_co_u32_e32 v3, 
vcc, v4, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v2, v4, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v3 -; GFX90A-NEXT: v_mul_hi_u32 v7, s14, v1 -; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX90A-NEXT: v_mul_lo_u32 v7, s15, v1 -; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 -; GFX90A-NEXT: v_mul_lo_u32 v8, s14, v1 -; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v12, v1, v5 -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v8 -; GFX90A-NEXT: v_mul_hi_u32 v11, v1, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 -; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, 
v9, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc ; GFX90A-NEXT: s_add_u32 s0, s4, s14 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX90A-NEXT: s_mov_b32 s15, s14 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX90A-NEXT: s_addc_u32 s1, s5, s14 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] -; GFX90A-NEXT: v_mul_lo_u32 v4, s4, v2 -; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX90A-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 -; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v7, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v2 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v3, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, s12, v2 -; GFX90A-NEXT: v_mul_hi_u32 v4, s12, v1 -; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 -; 
GFX90A-NEXT: v_mul_lo_u32 v4, s13, v1 -; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX90A-NEXT: v_mul_lo_u32 v5, s12, v1 -; GFX90A-NEXT: v_sub_u32_e32 v4, s5, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s12, v0 +; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, s13 ; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 -; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v7, vcc ; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s12, v5 -; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v8, v7, s[0:1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v8, v7, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, 1, 2, s[0:1] ; GFX90A-NEXT: v_mov_b32_e32 v8, s5 -; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v1, v4 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX90A-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1] -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 +; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v0, v3 +; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v8, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] 
+; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 ; GFX90A-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] ; GFX90A-NEXT: s_ashr_i32 s4, s9, 31 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 ; GFX90A-NEXT: s_add_u32 s8, s8, s4 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc ; GFX90A-NEXT: s_mov_b32 s5, s4 ; GFX90A-NEXT: s_addc_u32 s9, s9, s4 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX90A-NEXT: v_xor_b32_e32 v5, s1, v2 -; GFX90A-NEXT: v_mac_f32_e32 v3, s16, v4 -; GFX90A-NEXT: v_rcp_f32_e32 v4, v3 -; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, s1 -; GFX90A-NEXT: v_mul_f32_e32 v1, s17, v4 -; GFX90A-NEXT: v_mul_f32_e32 v4, s18, v1 -; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 -; GFX90A-NEXT: v_mac_f32_e32 v1, s19, v4 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX90A-NEXT: s_sub_u32 s10, 0, s8 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc -; GFX90A-NEXT: s_subb_u32 s11, 0, s9 -; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v1 -; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v4 -; GFX90A-NEXT: v_mul_lo_u32 v5, s11, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 +; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 +; 
GFX90A-NEXT: s_sub_u32 s0, 0, s8 +; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s1 +; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: s_subb_u32 s1, 0, s9 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s0, v2 +; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s1, v2 ; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX90A-NEXT: v_mul_lo_u32 v9, s10, v1 -; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 -; GFX90A-NEXT: v_mul_hi_u32 v10, v1, v9 -; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, s0, v2 +; GFX90A-NEXT: v_mul_lo_u32 v8, v2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v9 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX90A-NEXT: v_mul_hi_u32 v11, v4, v9 -; GFX90A-NEXT: v_mul_lo_u32 v9, v4, v9 +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX90A-NEXT: v_mul_hi_u32 v10, v4, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v5, v4, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v5 -; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v4, v7, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX90A-NEXT: v_mul_hi_u32 v9, s10, v1 -; GFX90A-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX90A-NEXT: v_mul_lo_u32 v9, s11, v1 -; 
GFX90A-NEXT: v_add_u32_e32 v8, v8, v9 -; GFX90A-NEXT: v_mul_lo_u32 v10, s10, v1 -; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v10 -; GFX90A-NEXT: v_mul_lo_u32 v12, v5, v10 -; GFX90A-NEXT: v_mul_lo_u32 v14, v1, v8 -; GFX90A-NEXT: v_mul_hi_u32 v10, v1, v10 -; GFX90A-NEXT: v_mul_hi_u32 v13, v1, v8 -; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v14 -; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 -; GFX90A-NEXT: v_mul_hi_u32 v9, v5, v8 -; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v5, v5, v8 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v9, vcc -; GFX90A-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, s0, v2 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s1, v2 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v12, v2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v2, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 ; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 -; GFX90A-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v7, vcc ; GFX90A-NEXT: s_add_u32 s0, s6, s10 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v5 +; GFX90A-NEXT: 
v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v4 -; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v1 -; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v4 +; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v2 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v1 -; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 -; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v4, s7, v4 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v4 -; GFX90A-NEXT: v_mul_hi_u32 v6, s8, v1 +; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 +; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v3 +; GFX90A-NEXT: v_mul_hi_u32 v6, s8, v2 ; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX90A-NEXT: v_mul_lo_u32 v6, s9, v1 +; GFX90A-NEXT: v_mul_lo_u32 v6, s9, v2 ; GFX90A-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-NEXT: v_mul_lo_u32 v7, s8, v1 +; GFX90A-NEXT: v_mul_lo_u32 v7, s8, v2 ; GFX90A-NEXT: v_sub_u32_e32 v6, s7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, s9 ; GFX90A-NEXT: v_sub_co_u32_e32 v7, vcc, s6, v7 @@ -12909,19 +12862,19 
@@ ; GFX90A-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v5 -; GFX90A-NEXT: v_add_co_u32_e64 v6, s[0:1], v1, v6 +; GFX90A-NEXT: v_add_co_u32_e64 v6, s[0:1], v2, v6 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc -; GFX90A-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v4, s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX90A-NEXT: v_xor_b32_e32 v5, s1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v6, s1 -; GFX90A-NEXT: v_subrev_co_u32_e32 v4, vcc, s0, v1 -; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc -; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, s1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, s1 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y @@ -12940,7 +12893,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s2, 0xffed2705 +; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 ; GFX6-NEXT: v_mov_b32_e32 v8, 0 ; GFX6-NEXT: v_mov_b32_e32 v7, 0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -12949,14 +12902,14 @@ ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: 
v_mul_lo_u32 v2, v1, s2 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 @@ -12973,62 +12926,60 @@ ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 -; GFX6-NEXT: s_mov_b32 s5, s9 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: s_ashr_i32 s2, s11, 31 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s10, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s1, s11, s2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; 
GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 +; GFX6-NEXT: s_mov_b32 s9, s8 +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: s_mov_b32 s3, 0x12d8fb +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: 
v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v2 ; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa ; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 @@ -13045,10 +12996,10 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -13058,7 +13009,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s8, 0xffed2705 +; GFX9-NEXT: s_mov_b32 s4, 0xffed2705 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -13067,13 +13018,10 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: 
v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s7, 31 -; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -13090,34 +13038,35 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 -; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 -; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] -; GFX9-NEXT: s_add_u32 s2, s6, s0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_addc_u32 s3, s7, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_xor_b64 
s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 @@ -13127,44 +13076,44 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX9-NEXT: s_mov_b32 s1, 0x12d8fb +; GFX9-NEXT: s_mov_b32 s5, 0x12d8fb ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s1 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, s5 +; GFX9-NEXT: 
v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s1, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s5, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s1, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s5, v2 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc -; GFX9-NEXT: s_mov_b32 s1, 0x12d8fa -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v2 +; GFX9-NEXT: s_mov_b32 s2, 0x12d8fa +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_i64_oddk_denom: @@ -13172,7 +13121,7 @@ ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 -; 
GFX90A-NEXT: s_mov_b32 s2, 0xffed2705 +; GFX90A-NEXT: s_mov_b32 s4, 0xffed2705 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -13181,12 +13130,12 @@ ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 -; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 @@ -13201,37 +13150,35 @@ ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s2 -; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s2 -; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 -; GFX90A-NEXT: v_mul_lo_u32 v7, v0, s2 -; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v7 -; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v7 -; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v7 -; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, v8, v11, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 -; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 -; GFX90A-NEXT: 
v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 +; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v8, v9, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v6, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 -; GFX90A-NEXT: s_add_u32 s2, s6, s0 +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: s_add_u32 s2, s2, s4 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX90A-NEXT: s_mov_b32 s1, s0 -; GFX90A-NEXT: s_addc_u32 s3, s7, s0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_addc_u32 s3, s3, s4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 ; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 @@ -13246,39 +13193,39 @@ ; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc -; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fb 
-; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 -; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s1 +; GFX90A-NEXT: s_mov_b32 s5, 0x12d8fb +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s5 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX90A-NEXT: v_subrev_co_u32_e32 v3, vcc, s1, v0 +; GFX90A-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v0 ; GFX90A-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc -; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v3 +; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v3 ; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc -; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fa -; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 +; GFX90A-NEXT: s_mov_b32 s2, 0x12d8fa +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX90A-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0 +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s0 -; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s4 +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 ; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; 
GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -13358,7 +13305,6 @@ ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -13368,27 +13314,29 @@ ; GFX6-NEXT: s_add_u32 s2, s2, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s3, s4 -; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_u32 s2, 0, s12 -; GFX6-NEXT: s_subb_u32 s3, 0, s13 -; GFX6-NEXT: s_ashr_i32 s14, s11, 31 +; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX6-NEXT: s_sub_u32 s4, 0, s8 +; GFX6-NEXT: s_subb_u32 s5, 0, s9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_ashr_i32 s10, s3, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX6-NEXT: s_add_u32 s2, s2, s10 +; GFX6-NEXT: s_mov_b32 s11, s10 +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 +; GFX6-NEXT: s_addc_u32 s3, s3, s10 ; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -13400,6 +13348,7 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0 @@ -13407,88 +13356,85 @@ ; GFX6-NEXT: v_mov_b32_e32 v6, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, s3, v0 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v7, s2, v0 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s10, s14 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s11, s14 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v7, s11, 
v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_mul_lo_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v7, s13, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s13, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v3, s13, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 +; 
GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 +; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 ; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v5, s11 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 
v4, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s14, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -13507,20 +13453,26 @@ ; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_u32 s10, 0, s8 -; GFX9-NEXT: s_subb_u32 s4, 0, s9 +; GFX9-NEXT: s_sub_u32 s2, 0, s8 +; GFX9-NEXT: s_subb_u32 s3, 0, s9 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 +; GFX9-NEXT: s_add_u32 s0, s6, s10 +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 @@ -13538,39 +13490,31 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 -; 
GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v5, s10, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v9, s10, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 -; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 -; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] -; GFX9-NEXT: s_add_u32 s0, s6, s10 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v7, s2, v0 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v9, v0, v7 +; GFX9-NEXT: v_mul_hi_u32 v10, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc +; 
GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 @@ -13642,25 +13586,25 @@ ; GFX90A-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX90A-NEXT: s_sub_u32 s2, 0, s8 -; GFX90A-NEXT: s_subb_u32 s3, 0, s9 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_sub_u32 s0, 0, s8 +; GFX90A-NEXT: s_subb_u32 s1, 0, s9 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 +; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: s_mov_b32 s11, s10 -; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 -; GFX90A-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 -; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 @@ -13677,34 +13621,32 @@ ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX90A-NEXT: 
v_add_co_u32_e64 v0, s[0:1], v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v0 -; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX90A-NEXT: v_mul_lo_u32 v7, s3, v0 -; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 -; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v0 -; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 -; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v2, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v4, s0, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: 
v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc ; GFX90A-NEXT: s_add_u32 s0, s6, s10 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 @@ -13900,12 +13842,14 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 ; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 -; GFX6-NEXT: s_sub_u32 s6, 0, s16 -; GFX6-NEXT: s_subb_u32 s7, 0, s17 +; GFX6-NEXT: s_sub_u32 s2, 0, s16 +; GFX6-NEXT: s_subb_u32 s3, 0, s17 ; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -13915,10 +13859,10 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s0, s8, s12 -; GFX6-NEXT: v_mul_lo_u32 v0, s6, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s6, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v3 -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s2, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v3 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 ; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v4 @@ -13940,33 +13884,30 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e64 v3, s[2:3], v3, v4 -; GFX6-NEXT: v_addc_u32_e64 v4, vcc, v2, v5, s[2:3] -; GFX6-NEXT: v_mul_lo_u32 v6, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 -; GFX6-NEXT: v_mul_lo_u32 v8, s7, 
v3 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX6-NEXT: v_mul_lo_u32 v7, s6, v3 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GFX6-NEXT: v_mul_lo_u32 v10, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v7 -; GFX6-NEXT: v_mul_hi_u32 v12, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v4, v7 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v4, v6 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v2, v6, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s8, v2 ; GFX6-NEXT: 
v_mul_hi_u32 v5, s8, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, s8, v2 @@ -13976,7 +13917,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, s9, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v0, vcc @@ -14031,15 +13971,15 @@ ; GFX6-NEXT: v_mac_f32_e32 v5, s21, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX6-NEXT: s_sub_u32 s2, 0, s8 +; GFX6-NEXT: s_sub_u32 s0, 0, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v5 -; GFX6-NEXT: v_mul_lo_u32 v7, s2, v6 -; GFX6-NEXT: s_subb_u32 s3, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v8, s3, v5 +; GFX6-NEXT: v_mul_hi_u32 v4, s0, v5 +; GFX6-NEXT: v_mul_lo_u32 v7, s0, v6 +; GFX6-NEXT: s_subb_u32 s1, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v8, s1, v5 ; GFX6-NEXT: s_ashr_i32 s14, s11, 31 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, s2, v5 +; GFX6-NEXT: v_mul_lo_u32 v7, s0, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_mul_lo_u32 v8, v5, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 @@ -14057,35 +13997,33 @@ ; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc -; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], v5, v4 -; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v6, v7, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v8, s2, v5 -; GFX6-NEXT: v_mul_hi_u32 v9, s2, v4 -; GFX6-NEXT: v_mul_lo_u32 v10, s3, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 +; GFX6-NEXT: v_mul_hi_u32 v7, s0, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, s1, v4 ; GFX6-NEXT: v_xor_b32_e32 v3, s12, v3 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GFX6-NEXT: v_mul_lo_u32 v9, s2, v4 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GFX6-NEXT: v_mul_lo_u32 v12, v4, 
v8 -; GFX6-NEXT: v_mul_hi_u32 v13, v4, v9 -; GFX6-NEXT: v_mul_hi_u32 v14, v4, v8 -; GFX6-NEXT: v_mul_hi_u32 v11, v5, v9 -; GFX6-NEXT: v_mul_lo_u32 v9, v5, v9 -; GFX6-NEXT: v_mul_hi_u32 v10, v5, v8 -; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v5, v8 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GFX6-NEXT: v_addc_u32_e64 v6, vcc, v6, v8, s[0:1] +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_mul_lo_u32 v7, s0, v4 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GFX6-NEXT: v_mul_lo_u32 v10, v4, v6 +; GFX6-NEXT: v_mul_hi_u32 v11, v4, v7 +; GFX6-NEXT: v_mul_hi_u32 v12, v4, v6 +; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 +; GFX6-NEXT: v_mul_lo_u32 v7, v5, v7 +; GFX6-NEXT: v_mul_hi_u32 v8, v5, v6 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v5, v6 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc ; GFX6-NEXT: s_add_u32 s0, s10, s14 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: s_addc_u32 s1, s11, s14 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GFX6-NEXT: v_mul_lo_u32 v6, s10, v5 ; GFX6-NEXT: v_mul_hi_u32 v7, s10, v4 @@ -14165,20 +14103,24 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 -; GFX9-NEXT: s_sub_u32 s8, 0, s12 -; GFX9-NEXT: s_subb_u32 s4, 
0, s13 +; GFX9-NEXT: s_sub_u32 s2, 0, s12 +; GFX9-NEXT: s_subb_u32 s3, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v1, s8, v3 -; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s8, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 +; GFX9-NEXT: v_mul_hi_u32 v1, s2, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v1, v3, v4 @@ -14197,38 +14139,32 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc -; GFX9-NEXT: v_add_co_u32_e64 v3, s[2:3], v3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, s8, v4 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 -; GFX9-NEXT: v_mul_lo_u32 v9, s8, v3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: v_mul_lo_u32 v10, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v11, v3, v9 -; GFX9-NEXT: v_mul_hi_u32 v12, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v4, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v4, v9 -; GFX9-NEXT: v_mul_hi_u32 v7, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 
v6, vcc, v7, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s8, s5, 31 -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, s2, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 ; GFX9-NEXT: s_add_u32 s2, s4, s8 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v3, v7 +; GFX9-NEXT: v_mul_hi_u32 v10, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v6, v2, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v2, v7 +; GFX9-NEXT: v_mul_hi_u32 v5, v2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_addc_u32 s3, s5, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] ; GFX9-NEXT: v_mul_lo_u32 v4, s14, v2 ; GFX9-NEXT: v_mul_hi_u32 v5, s14, v3 @@ -14294,13 +14230,13 @@ ; GFX9-NEXT: v_mac_f32_e32 v5, s19, v6 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: s_sub_u32 s2, 0, s10 -; GFX9-NEXT: s_subb_u32 s3, 0, s11 -; GFX9-NEXT: v_mul_hi_u32 v7, s2, v5 -; GFX9-NEXT: v_mul_lo_u32 v8, s2, v6 -; GFX9-NEXT: v_mul_lo_u32 v9, s3, v5 +; GFX9-NEXT: s_sub_u32 s0, 0, s10 +; GFX9-NEXT: s_subb_u32 
s1, 0, s11 +; GFX9-NEXT: v_mul_hi_u32 v7, s0, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, s0, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, s1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v4, s0, v5 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 ; GFX9-NEXT: v_mul_lo_u32 v8, v5, v7 @@ -14319,34 +14255,32 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v0, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v8, vcc -; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v6, v7, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, s2, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, s2, v4 -; GFX9-NEXT: v_mul_lo_u32 v10, s3, v4 -; GFX9-NEXT: v_mul_lo_u32 v11, s2, v4 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v13, v4, v11 -; GFX9-NEXT: v_mul_hi_u32 v14, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v10, v5, v11 -; GFX9-NEXT: v_mul_lo_u32 v11, v5, v11 -; GFX9-NEXT: v_mul_hi_u32 v9, v5, v8 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, v8 -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s0, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, s0, v4 +; GFX9-NEXT: v_mul_lo_u32 v8, s1, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s0, v4 ; GFX9-NEXT: s_add_u32 s0, s6, s12 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX9-NEXT: 
v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: v_mul_lo_u32 v10, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v11, v4, v9 +; GFX9-NEXT: v_mul_hi_u32 v12, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v5, v9 +; GFX9-NEXT: v_mul_lo_u32 v9, v5, v9 +; GFX9-NEXT: v_mul_hi_u32 v7, v5, v6 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: s_addc_u32 s1, s7, s12 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] ; GFX9-NEXT: v_mul_lo_u32 v6, s6, v5 ; GFX9-NEXT: v_mul_hi_u32 v7, s6, v4 @@ -14426,222 +14360,218 @@ ; GFX90A-NEXT: s_mov_b32 s5, s4 ; GFX90A-NEXT: s_addc_u32 s3, s3, s4 ; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s12 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s13 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 -; GFX90A-NEXT: s_sub_u32 s2, 0, s12 -; GFX90A-NEXT: s_subb_u32 s3, 0, s13 -; GFX90A-NEXT: v_mac_f32_e32 v1, s16, v2 -; GFX90A-NEXT: v_rcp_f32_e32 v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: v_mul_f32_e32 v1, s17, v1 -; GFX90A-NEXT: v_mul_f32_e32 v2, s18, v1 -; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 -; GFX90A-NEXT: v_mac_f32_e32 v1, s19, v2 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: s_sub_u32 s0, 0, s12 +; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: 
s_subb_u32 s1, 0, s13 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 +; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: s_mov_b32 s15, s14 -; GFX90A-NEXT: v_mul_hi_u32 v4, s2, v1 -; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX90A-NEXT: v_add_u32_e32 v4, v4, v5 -; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v1 -; GFX90A-NEXT: v_mul_lo_u32 v5, v1, v3 -; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v6 -; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 +; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX90A-NEXT: v_mul_hi_u32 v8, v2, v6 -; GFX90A-NEXT: v_mul_lo_u32 v6, v2, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 
v4, vcc, v6, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v2, v4, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v1 -; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX90A-NEXT: v_mul_lo_u32 v7, s3, v1 -; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 -; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v1 -; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v12, v1, v5 -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v8 -; GFX90A-NEXT: v_mul_hi_u32 v11, v1, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX90A-NEXT: 
v_addc_co_u32_e32 v5, vcc, v9, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc ; GFX90A-NEXT: s_add_u32 s0, s4, s14 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX90A-NEXT: s_addc_u32 s1, s5, s14 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] -; GFX90A-NEXT: v_mul_lo_u32 v4, s4, v2 -; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX90A-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 -; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v7, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v2 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v3, vcc -; GFX90A-NEXT: v_mul_lo_u32 v2, s12, v2 -; GFX90A-NEXT: v_mul_hi_u32 v3, s12, v1 -; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v1 -; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: 
v_addc_co_u32_e32 v1, vcc, v6, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s12, v1 -; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v4, s13 -; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, s4, v1 -; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc -; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v1 -; GFX90A-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v3, s[0:1] +; GFX90A-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX90A-NEXT: v_mul_lo_u32 v0, s12, v0 +; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s13 +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 +; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v0 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 -; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] +; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 -; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s12, v5 +; GFX90A-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v5 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] -; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX90A-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] ; GFX90A-NEXT: v_mov_b32_e32 v5, s5 -; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 +; 
GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 ; GFX90A-NEXT: s_ashr_i32 s0, s11, 31 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 ; GFX90A-NEXT: s_add_u32 s2, s10, s0 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX90A-NEXT: s_mov_b32 s1, s0 ; GFX90A-NEXT: s_addc_u32 s3, s11, s0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX90A-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX90A-NEXT: v_xor_b32_e32 v0, s14, v0 +; GFX90A-NEXT: s_sub_u32 s0, 0, s4 ; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 -; GFX90A-NEXT: v_xor_b32_e32 v5, s14, v2 -; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s14, v1 -; GFX90A-NEXT: v_mac_f32_e32 v3, s16, v4 -; GFX90A-NEXT: v_rcp_f32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, s14 -; GFX90A-NEXT: s_sub_u32 s2, 0, s4 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc -; GFX90A-NEXT: v_mul_f32_e32 v1, s17, v4 -; GFX90A-NEXT: v_mul_f32_e32 v4, s18, v1 -; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 -; GFX90A-NEXT: v_mac_f32_e32 v1, s19, v4 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX90A-NEXT: s_subb_u32 s3, 0, s5 -; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 -; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v1 -; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v4 -; GFX90A-NEXT: v_mul_lo_u32 v5, s3, v1 +; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 +; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, s14 +; GFX90A-NEXT: v_subrev_co_u32_e32 
v0, vcc, s14, v0 +; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: s_subb_u32 s1, 0, s5 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s0, v2 +; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s1, v2 ; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX90A-NEXT: v_mul_lo_u32 v9, s2, v1 -; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 -; GFX90A-NEXT: v_mul_hi_u32 v10, v1, v9 -; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, s0, v2 +; GFX90A-NEXT: v_mul_lo_u32 v8, v2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v9 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX90A-NEXT: v_mul_hi_u32 v11, v4, v9 -; GFX90A-NEXT: v_mul_lo_u32 v9, v4, v9 +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX90A-NEXT: v_mul_hi_u32 v10, v4, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v5, v4, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v5 -; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v4, v7, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v5 -; GFX90A-NEXT: v_mul_hi_u32 v9, s2, v1 -; GFX90A-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX90A-NEXT: v_mul_lo_u32 v9, s3, v1 -; GFX90A-NEXT: v_add_u32_e32 v8, v8, v9 -; GFX90A-NEXT: v_mul_lo_u32 v10, s2, v1 -; GFX90A-NEXT: 
v_mul_hi_u32 v11, v5, v10 -; GFX90A-NEXT: v_mul_lo_u32 v12, v5, v10 -; GFX90A-NEXT: v_mul_lo_u32 v14, v1, v8 -; GFX90A-NEXT: v_mul_hi_u32 v10, v1, v10 -; GFX90A-NEXT: v_mul_hi_u32 v13, v1, v8 -; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v14 -; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 -; GFX90A-NEXT: v_mul_hi_u32 v9, v5, v8 -; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v5, v5, v8 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v9, vcc -; GFX90A-NEXT: v_add_u32_e32 v4, v4, v7 -; GFX90A-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, s0, v2 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s1, v2 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v12, v2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v2, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v7, vcc ; GFX90A-NEXT: s_add_u32 s0, s6, s10 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; 
GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v4 -; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v1 -; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v4 +; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v2 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v1 -; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 -; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc -; GFX90A-NEXT: v_mul_lo_u32 v4, s7, v4 -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc -; GFX90A-NEXT: v_mul_lo_u32 v4, s4, v4 -; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX90A-NEXT: v_add_u32_e32 v4, v5, v4 -; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v1 -; GFX90A-NEXT: v_add_u32_e32 v4, v4, v5 -; GFX90A-NEXT: v_mul_lo_u32 v1, s4, v1 -; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v4 +; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 +; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v2 +; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v2 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v2 +; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, s5 -; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 +; 
GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 ; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v1 +; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v2 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v8 ; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] @@ -14655,23 +14585,23 @@ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 ; GFX90A-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] ; GFX90A-NEXT: v_mov_b32_e32 v7, s7 -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX90A-NEXT: v_xor_b32_e32 v1, s10, v1 -; GFX90A-NEXT: v_xor_b32_e32 v5, s10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v6, s10 -; GFX90A-NEXT: v_subrev_co_u32_e32 v4, vcc, s10, v1 -; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc -; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX90A-NEXT: v_xor_b32_e32 v2, s10, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, s10 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> 
%x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -54,31 +54,29 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc -; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v7, v6 ; GFX9-NEXT: v_mul_lo_u32 v8, v8, v5 -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v7, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 -; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8 -; GFX9-NEXT: v_mul_lo_u32 v13, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v16, v5, v7 -; GFX9-NEXT: v_mul_hi_u32 v17, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 -; GFX9-NEXT: v_mul_hi_u32 v11, v9, v8 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v17, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v13, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v11, v14, vcc +; GFX9-NEXT: v_add3_u32 v8, v10, v9, v8 +; GFX9-NEXT: v_mul_lo_u32 v11, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v12, v5, v7 +; GFX9-NEXT: v_mul_hi_u32 v13, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v10, v6, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v9, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v15, v13, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v11, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v12, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v14, vcc 
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v15, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v8, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7 @@ -217,31 +215,29 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v10, vcc -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v6, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 -; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 +; GFX9-NEXT: v_add3_u32 v7, v9, v8, v7 ; GFX9-NEXT: v_mul_lo_u32 v10, v4, v7 ; GFX9-NEXT: v_mul_hi_u32 v11, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v15, v4, v7 -; GFX9-NEXT: v_mul_hi_u32 v14, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v14, v4, v7 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v5, v7 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v13, v15, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v7 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v14, v12, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v9, vcc +; 
GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v5 @@ -375,31 +371,29 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v13, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v14, v10, vcc -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v6, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 -; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v7 -; GFX9-NEXT: v_mul_hi_u32 v15, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v16, v4, v7 -; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 -; GFX9-NEXT: v_mul_hi_u32 v10, v8, v7 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v15, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v14, v16, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v13, vcc +; GFX9-NEXT: v_add3_u32 v7, v9, v8, v7 +; GFX9-NEXT: v_mul_lo_u32 v10, v4, v7 +; GFX9-NEXT: v_mul_hi_u32 v11, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v12, v4, v7 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v5, v7 +; GFX9-NEXT: v_add_co_u32_e32 
v10, vcc, v11, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v14, v12, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v7 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v9, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v13, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v6 @@ -534,31 +528,29 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v10, vcc -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v6, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 -; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 +; GFX9-NEXT: v_add3_u32 v7, v9, v8, v7 ; GFX9-NEXT: v_mul_lo_u32 v10, v4, v7 ; GFX9-NEXT: v_mul_hi_u32 v11, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v15, v4, v7 -; GFX9-NEXT: v_mul_hi_u32 v14, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v14, v4, v7 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v5, v7 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v13, v15, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v11, 
vcc, v13, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v7 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v14, v12, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v9, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v5 @@ -815,31 +807,29 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc -; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v7, v6 ; GFX9-NEXT: v_mul_lo_u32 v8, v8, v5 -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v7, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 -; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8 -; GFX9-NEXT: v_mul_lo_u32 v13, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v16, v5, v7 -; GFX9-NEXT: v_mul_hi_u32 v17, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 -; GFX9-NEXT: v_mul_hi_u32 v11, v9, v8 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v17, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v13, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v11, v14, vcc +; GFX9-NEXT: v_add3_u32 
v8, v10, v9, v8 +; GFX9-NEXT: v_mul_lo_u32 v11, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v12, v5, v7 +; GFX9-NEXT: v_mul_hi_u32 v13, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v10, v6, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v9, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v15, v13, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v11, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v12, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v15, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v8, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7 @@ -998,31 +988,29 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v10, vcc -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v6, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 -; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 +; GFX9-NEXT: v_add3_u32 v7, v9, v8, v7 ; GFX9-NEXT: v_mul_lo_u32 v10, v4, v7 ; GFX9-NEXT: v_mul_hi_u32 v11, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v15, v4, v7 -; GFX9-NEXT: v_mul_hi_u32 v14, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v14, v4, v7 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v6 +; GFX9-NEXT: 
v_mul_lo_u32 v6, v5, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v5, v7 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v13, v15, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v7 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v14, v12, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v9, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -372,9 +372,9 @@ ; GCN-ISEL-LABEL: body: ; GCN-ISEL-LABEL: bb.3 ; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 -; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]] +; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, killed %{{[0-9]+}}, killed %[[CARRY]] ; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64 -; GCN-ISEL: S_SUB_CO_PSEUDO %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]] +; GCN-ISEL: S_SUB_CO_PSEUDO killed %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]] define amdgpu_kernel void @sudiv64(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = udiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -5,36 +5,39 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s5, 31 -; GCN-NEXT: s_add_u32 s4, s4, s2 -; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s5, s5, s2 -; GCN-NEXT: s_xor_b64 s[12:13], s[4:5], s[2:3] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GCN-NEXT: s_sub_u32 s4, 0, s12 -; GCN-NEXT: s_subb_u32 s5, 0, s13 -; GCN-NEXT: s_ashr_i32 s14, s11, 31 +; GCN-NEXT: s_ashr_i32 s8, s3, 31 +; GCN-NEXT: s_add_u32 s2, s2, s8 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_addc_u32 s3, s3, s8 +; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GCN-NEXT: s_sub_u32 s4, 0, s10 +; GCN-NEXT: s_subb_u32 s5, 0, s11 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 s15, s14 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s12, s3, 31 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_add_u32 s2, s2, s12 +; GCN-NEXT: s_mov_b32 s13, s12 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: s_addc_u32 s3, s3, s12 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: 
v_mul_hi_u32 v4, v0, v5 @@ -46,73 +49,69 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v3 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, s4, v3 -; GCN-NEXT: v_mul_hi_u32 v6, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, s4, v0 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v3, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v3, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v7, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[0:1] -; GCN-NEXT: s_add_u32 s0, s10, s14 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: s_addc_u32 s1, s11, s14 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] -; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s11, v2 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 +; GCN-NEXT: 
v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_mul_lo_u32 v8, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v5, s2, v2 +; GCN-NEXT: v_mul_hi_u32 v6, s3, v2 +; GCN-NEXT: v_mul_lo_u32 v2, s3, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mov_b32 s4, s8 +; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s12, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s13, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s11 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, s12, v0 +; GCN-NEXT: 
v_mul_lo_u32 v3, s10, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -120,18 +119,18 @@ ; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s11 +; GCN-NEXT: v_mov_b32_e32 v6, s3 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[2:3] +; GCN-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 ; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 @@ 
-291,32 +290,30 @@ ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v16, v14, vcc ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v11, vcc -; GCN-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 -; GCN-NEXT: v_addc_u32_e64 v9, vcc, v6, v10, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v11, v7, v9 -; GCN-NEXT: v_mul_hi_u32 v12, v7, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v9, v7, v6 +; GCN-NEXT: v_mul_hi_u32 v10, v7, v5 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v5 ; GCN-NEXT: v_mul_lo_u32 v7, v7, v5 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_mul_lo_u32 v11, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v12, v5, v7 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v10, v6, v7 +; GCN-NEXT: v_mul_lo_u32 v7, v6, v7 +; GCN-NEXT: v_mul_hi_u32 v9, v6, v8 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GCN-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GCN-NEXT: v_mul_lo_u32 v13, v5, v8 -; GCN-NEXT: v_mul_hi_u32 v16, v5, v7 -; GCN-NEXT: v_mul_hi_u32 v17, v5, v8 -; GCN-NEXT: v_mul_hi_u32 v12, v9, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v9, v7 -; GCN-NEXT: v_mul_hi_u32 v11, v9, v8 -; GCN-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, v15, v17, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v16, v12, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v12, vcc, v15, v13, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v6, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v10, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v15, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e64 v6, vcc, v6, v8, s[4:5] ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v7 @@ -1115,19 +1112,20 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_sdiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s7, 31 -; GCN-NEXT: s_add_u32 s0, s6, s2 -; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s7, s2 -; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s3, 0, s8 -; GCN-NEXT: s_subb_u32 s6, 0, s9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s8, s3, 31 +; GCN-NEXT: s_add_u32 s2, s2, s8 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_addc_u32 s3, s3, s8 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -1137,10 +1135,10 @@ ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s3, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v7, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s3, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v7, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s4, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6 @@ -1157,75 +1155,74 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: 
v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, s3, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s6, v0 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v2, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 
v1, vcc, v3, v1, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 -; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_mul_lo_u32 v3, s8, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 24, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v3 +; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 
vcc, s9, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -1362,32 +1359,30 @@ ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v9, vcc -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 -; GCN-NEXT: v_addc_u32_e64 v7, vcc, v4, v8, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v9, v5, v7 -; GCN-NEXT: v_mul_hi_u32 v10, v5, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v7, v5, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v3 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v3 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_mul_lo_u32 v9, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v10, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v8, v4, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v4, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v4, v6 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_mul_lo_u32 v11, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v14, v3, v5 -; GCN-NEXT: v_mul_hi_u32 v15, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v7, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v7, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v7, v6 -; GCN-NEXT: v_add_i32_e32 v11, 
vcc, v14, v11 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, v13, v15, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v14, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, v13, v11, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v4, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v4, v6, s[4:5] ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v4, 24 ; GCN-NEXT: v_mul_hi_u32 v3, v3, 24 ; GCN-NEXT: v_mul_hi_u32 v4, v4, 24 @@ -1540,6 +1535,7 @@ ; GCN-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1566,32 +1562,30 @@ ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v9, vcc -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 -; GCN-NEXT: v_addc_u32_e64 v7, vcc, v4, v8, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v9, v5, v7 -; GCN-NEXT: v_mul_hi_u32 v10, v5, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v7, v5, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v3 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v3 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_mul_lo_u32 v9, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v10, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v8, 
v4, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v4, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v4, v6 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_mul_lo_u32 v11, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v14, v3, v5 -; GCN-NEXT: v_mul_hi_u32 v15, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v7, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v7, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v7, v6 -; GCN-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, v13, v15, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v14, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, v13, v11, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v4, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v4, v6, s[4:5] ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v5, 17, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 17, v3 @@ -1599,7 +1593,6 @@ ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v13, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 -; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -13,8 +13,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GCN-NEXT: s_sub_u32 s2, 0, s12 -; GCN-NEXT: 
s_subb_u32 s3, 0, s13 +; GCN-NEXT: s_sub_u32 s0, 0, s12 +; GCN-NEXT: s_subb_u32 s1, 0, s13 ; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 @@ -26,10 +26,10 @@ ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s0, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v7, s1, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s0, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6 @@ -46,32 +46,30 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s0, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v2, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v4, s10, v3 ; GCN-NEXT: v_mul_hi_u32 v5, s10, v0 ; GCN-NEXT: v_mul_hi_u32 v6, s10, v3 @@ -269,32 +267,30 @@ ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v10, vcc -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 -; GCN-NEXT: v_addc_u32_e64 v8, vcc, v5, v9, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v10, v6, v8 -; GCN-NEXT: v_mul_hi_u32 v11, v6, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v6, v5 +; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v7, v7, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_mul_lo_u32 v10, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v11, v4, v6 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v9, v5, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v5, v6 +; GCN-NEXT: v_mul_hi_u32 v8, v5, v7 ; 
GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_mul_lo_u32 v12, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v15, v4, v6 -; GCN-NEXT: v_mul_hi_u32 v16, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v11, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v8, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v15, v11, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, v14, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v14, v8, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v7, s[4:5] ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v6 @@ -889,23 +885,25 @@ ; GCN-NEXT: s_xor_b64 s[12:13], s[4:5], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GCN-NEXT: s_sub_u32 s4, 0, s12 -; GCN-NEXT: s_subb_u32 s5, 0, s13 +; GCN-NEXT: s_sub_u32 s0, 0, s12 +; GCN-NEXT: s_subb_u32 s1, 0, s13 ; GCN-NEXT: s_ashr_i32 s10, s11, 31 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 
v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: s_mov_b32 s5, s9 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 @@ -922,35 +920,32 @@ ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v3 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, s4, v3 -; GCN-NEXT: v_mul_hi_u32 v6, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, s4, v0 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v3, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v3, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v7, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[0:1] +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; 
GCN-NEXT: v_mul_lo_u32 v8, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc ; GCN-NEXT: s_add_u32 s0, s2, s10 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GCN-NEXT: s_addc_u32 s1, s3, s10 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc ; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[10:11] ; GCN-NEXT: v_mul_lo_u32 v3, s14, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s14, v0 @@ -961,7 +956,6 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v5, s15, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s15, v0 -; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc @@ -1295,23 +1289,25 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_srem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s0, s7, 31 -; GCN-NEXT: s_add_u32 s2, s6, s0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s7, s0 -; GCN-NEXT: s_xor_b64 s[8:9], s[2:3], s[0:1] +; GCN-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-NEXT: s_add_u32 s2, s2, s4 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_addc_u32 s3, s3, s4 +; GCN-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GCN-NEXT: v_cvt_f32_u32_e32 
v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_sub_u32 s2, 0, s8 ; GCN-NEXT: s_subb_u32 s3, 0, s9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1338,32 +1334,30 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s3, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s2, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 +; GCN-NEXT: 
v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v2, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v3, v1, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1539,32 +1533,30 @@ ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v8, vcc -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 -; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v4, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v9, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v10, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v13, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v14, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v6, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 
v13, v10 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc -; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v13, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v12, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v12, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v5, s[4:5] ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 ; GCN-NEXT: v_mul_hi_u32 v2, v2, 24 ; GCN-NEXT: v_mul_hi_u32 v3, v3, 24 @@ -1715,6 +1707,7 @@ ; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1741,32 +1734,30 @@ ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v8, vcc -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 -; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v4, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v9, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v10, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v4 +; 
GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v13, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v14, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v6, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc -; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v13, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v12, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v12, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v5, s[4:5] ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v4, 17, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2 @@ -1775,7 +1766,6 @@ ; GCN-NEXT: v_mul_lo_u32 v3, v1, v2 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v0, v2 -; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -5,16 +5,16 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; 
GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s4, 0, s2 -; GCN-NEXT: s_subb_u32 s5, 0, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GCN-NEXT: s_sub_u32 s4, 0, s8 +; GCN-NEXT: s_subb_u32 s5, 0, s9 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -44,65 +44,63 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, s4, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_mul_lo_u32 v7, s4, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s10, 
v3 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v6, s10, v3 -; GCN-NEXT: v_mul_hi_u32 v7, s11, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v2, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v6, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v7, s3, v3 +; GCN-NEXT: v_mul_lo_u32 v3, s3, v3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc -; GCN-NEXT: v_mul_lo_u32 v6, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mov_b32 s4, s8 +; GCN-NEXT: v_mul_lo_u32 v6, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 -; GCN-NEXT: v_mov_b32_e32 
v5, s3 +; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s9, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s8, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 +; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -110,13 +108,13 @@ ; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s11 +; GCN-NEXT: v_mov_b32_e32 v6, s3 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] @@ -254,32 +252,30 @@ ; GCN-NEXT: 
v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v10, vcc -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 -; GCN-NEXT: v_addc_u32_e64 v8, vcc, v5, v9, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v10, v6, v8 -; GCN-NEXT: v_mul_hi_u32 v11, v6, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v6, v5 +; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v7, v7, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_mul_lo_u32 v10, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v11, v4, v6 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v9, v5, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v5, v6 +; GCN-NEXT: v_mul_hi_u32 v8, v5, v7 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_mul_lo_u32 v12, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v15, v4, v6 -; GCN-NEXT: v_mul_hi_u32 v16, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v11, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v8, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v15, v11, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, v14, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v14, v8, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v7, s[4:5] ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 
v5, v7, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v5 @@ -719,14 +715,14 @@ ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: s_sub_u32 s2, 0, s0 -; GCN-NEXT: s_subb_u32 s3, 0, s1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v1 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v2 -; GCN-NEXT: v_mul_lo_u32 v5, s3, v1 +; GCN-NEXT: s_sub_u32 s0, 0, s0 +; GCN-NEXT: s_subb_u32 s1, 0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v1 +; GCN-NEXT: v_mul_lo_u32 v4, s0, v2 +; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v1 +; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v4 @@ -742,33 +738,31 @@ ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc -; GCN-NEXT: v_add_i32_e64 v1, s[0:1], v1, v3 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, s2, v3 -; GCN-NEXT: v_mul_hi_u32 v6, s2, v1 -; GCN-NEXT: v_mul_lo_u32 v7, s3, v1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, s2, v1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GCN-NEXT: v_mul_lo_u32 v11, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v12, v1, v6 -; GCN-NEXT: v_mul_hi_u32 v13, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v10, v3, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 -; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v12, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v8, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, 
vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[0:1] +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 +; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v10, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v11, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, v9, v11, vcc +; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc ; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v1, v3, v1 @@ -919,30 +913,28 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GCN-NEXT: s_sub_u32 s2, 0, s6 -; GCN-NEXT: s_subb_u32 s3, 0, s7 -; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; 
GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v7, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s4, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6 @@ -959,71 +951,71 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 
+; GCN-NEXT: v_mul_hi_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v2, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v3, v1, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 -; GCN-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s7, v0 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_mul_lo_u32 v3, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 24, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s6, v3 +; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 ; GCN-NEXT: 
v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v5 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv_k_num_i64: @@ -1148,32 +1140,30 @@ ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v8, vcc -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 -; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v4, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v2 +; GCN-NEXT: v_add_i32_e32 v6, 
vcc, v7, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v9, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v10, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v13, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v14, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v6, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc -; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v13, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v12, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v12, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v5, s[4:5] ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v5, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v1, v2 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 @@ -1390,7 +1380,7 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xffe8 +; GCN-NEXT: s_movk_i32 s4, 0xffe8 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1399,12 +1389,13 @@ ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 
-; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 @@ -1421,43 +1412,40 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 -; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 
v4, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc @@ -1471,8 +1459,8 @@ ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, s10, v8 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 ; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc ; 
GCN-NEXT: v_subrev_i32_e32 v5, vcc, 24, v8 ; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc @@ -1578,7 +1566,7 @@ ; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: s_movk_i32 s6, 0xffe8 +; GCN-NEXT: s_movk_i32 s4, 0xffe8 ; GCN-NEXT: v_mov_b32_e32 v10, 0 ; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 @@ -1587,9 +1575,9 @@ ; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v2, s6 -; GCN-NEXT: v_mul_lo_u32 v5, v3, s6 -; GCN-NEXT: v_mul_lo_u32 v6, v2, s6 +; GCN-NEXT: v_mul_hi_u32 v4, v2, s4 +; GCN-NEXT: v_mul_lo_u32 v5, v3, s4 +; GCN-NEXT: v_mul_lo_u32 v6, v2, s4 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_mul_lo_u32 v7, v2, v4 @@ -1606,31 +1594,29 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v6, vcc -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v4 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[4:5] -; GCN-NEXT: v_mul_hi_u32 v6, v2, s6 -; GCN-NEXT: v_mul_lo_u32 v7, v4, s6 -; GCN-NEXT: v_mul_lo_u32 v8, v2, s6 -; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v8 -; GCN-NEXT: v_mul_hi_u32 v12, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v10, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v9, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, 
v5 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[4:5] ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_hi_u32 v4, v2, s4 +; GCN-NEXT: v_mul_lo_u32 v5, v3, s4 +; GCN-NEXT: v_mul_lo_u32 v6, v2, s4 +; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, v10, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v6, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -13,8 +13,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GCN-NEXT: s_sub_u32 s2, 0, s12 -; GCN-NEXT: s_subb_u32 s3, 0, s13 +; GCN-NEXT: s_sub_u32 s0, 0, s12 +; GCN-NEXT: s_subb_u32 s1, 0, s13 ; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 @@ -26,10 +26,10 @@ ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 -; GCN-NEXT: v_mul_lo_u32 
v6, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s0, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v7, s1, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s0, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6 @@ -46,32 +46,30 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s0, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 +; GCN-NEXT: 
v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v2, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v4, s10, v3 ; GCN-NEXT: v_mul_hi_u32 v5, s10, v0 ; GCN-NEXT: v_mul_hi_u32 v6, s10, v3 @@ -264,32 +262,30 @@ ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v10, vcc -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 -; GCN-NEXT: v_addc_u32_e64 v8, vcc, v5, v9, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v10, v6, v8 -; GCN-NEXT: v_mul_hi_u32 v11, v6, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v6, v5 +; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v7, v7, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_mul_lo_u32 v10, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v11, v4, v6 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v9, v5, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v5, v6 +; GCN-NEXT: v_mul_hi_u32 v8, v5, v7 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_mul_lo_u32 v12, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v15, v4, v6 -; GCN-NEXT: v_mul_hi_u32 v16, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v11, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v8, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 -; GCN-NEXT: v_add_i32_e32 v6, 
vcc, v12, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v15, v11, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, v14, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v14, v8, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v7, s[4:5] ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v5 @@ -747,8 +743,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GCN-NEXT: s_sub_u32 s2, 0, s6 -; GCN-NEXT: s_subb_u32 s3, 0, s7 +; GCN-NEXT: s_sub_u32 s0, 0, s6 +; GCN-NEXT: s_subb_u32 s1, 0, s7 ; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 @@ -760,10 +756,10 @@ ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s0, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v7, s1, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s0, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6 @@ -780,32 +776,30 @@ ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, s2, 
v4 -; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s0, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v2, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v3, v1, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 
@@ -941,7 +935,7 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xffe8 +; GCN-NEXT: s_movk_i32 s4, 0xffe8 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -950,12 +944,13 @@ ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 @@ -972,43 +967,40 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 -; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 
v5, vcc, v5, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: 
v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc @@ -1018,8 +1010,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 ; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc @@ -1039,7 +1031,7 @@ ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem_k_den_i64: @@ -1173,32 +1165,30 @@ ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v8, vcc -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 -; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v4, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v9, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v10, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v13, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v14, v2, v5 -; GCN-NEXT: 
v_mul_hi_u32 v9, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v6, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc -; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v13, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v12, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v12, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v5, s[4:5] ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v5, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v1, v2 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2