Index: lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- lib/CodeGen/RegisterCoalescer.cpp +++ lib/CodeGen/RegisterCoalescer.cpp @@ -111,6 +111,7 @@ cl::Hidden); namespace { + class JoinVals; class RegisterCoalescer : public MachineFunctionPass, private LiveRangeEdit::Delegate { @@ -209,12 +210,16 @@ /// @p ToMerge will occupy in the coalescer register. @p LI has its subrange /// lanemasks already adjusted to the coalesced register. void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, - LaneBitmask LaneMask, CoalescerPair &CP); + LaneBitmask LaneMask, CoalescerPair &CP, + const JoinVals &LHSMainVals, + const JoinVals &RHSMainVals); /// Join the liveranges of two subregisters. Joins @p RRange into /// @p LRange, @p RRange may be invalid afterwards. void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, - LaneBitmask LaneMask, const CoalescerPair &CP); + LaneBitmask LaneMask, const CoalescerPair &CP, + const JoinVals &LHSMainVals, + const JoinVals &RHSMainVals); /// We found a non-trivially-coalescable copy. If the source value number is /// defined by a copy from the destination reg see if we can merge these two @@ -2171,9 +2176,9 @@ /// be smaller than the lanemask produced by SubIdx when merging subranges. const LaneBitmask LaneMask; - /// This is true when joining sub register ranges, false when joining main - /// ranges. - const bool SubRangeJoin; + /// This is nullptr when joining main ranges, or a pointer to the main range + /// JoinVals when joining sub register ranges. + const JoinVals *MainVals; /// Whether the current LiveInterval tracks subregister liveness. const bool TrackSubRegLiveness; @@ -2272,6 +2277,13 @@ /// One entry per value number in LI. SmallVector Vals; + /// Set of def location of any val in a main range that resolves to an + /// "identical CR_Erase". + std::set IdenticalErases; + + /// Return whether this is a subrange join. 
+ bool isSubRangeJoin() const { return MainVals != nullptr; } + /// Compute the bitmask of lanes actually written by DefMI. /// Set Redef if there are any partial register definitions that depend on the /// previous value of the register. @@ -2330,10 +2342,10 @@ public: JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, LaneBitmask LaneMask, SmallVectorImpl &newVNInfo, const CoalescerPair &cp, - LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin, - bool TrackSubRegLiveness) + LiveIntervals *lis, const TargetRegisterInfo *TRI, + const JoinVals *MainVals, bool TrackSubRegLiveness) : LR(LR), Reg(Reg), SubIdx(SubIdx), LaneMask(LaneMask), - SubRangeJoin(SubRangeJoin), TrackSubRegLiveness(TrackSubRegLiveness), + MainVals(MainVals), TrackSubRegLiveness(TrackSubRegLiveness), NewVNInfo(newVNInfo), CP(cp), LIS(lis), Indexes(LIS->getSlotIndexes()), TRI(TRI), Assignments(LR.getNumValNums(), -1), Vals(LR.getNumValNums()) {} @@ -2412,28 +2424,8 @@ const LiveInterval &LI = LIS->getInterval(SrcReg); const VNInfo *ValueIn; - // No subrange involved. - if (!SubRangeJoin || !LI.hasSubRanges()) { - LiveQueryResult LRQ = LI.Query(Def); - ValueIn = LRQ.valueIn(); - } else { - // Query subranges. Ensure that all matching ones take us to the same def - // (allowing some of them to be undef). - ValueIn = nullptr; - for (const LiveInterval::SubRange &S : LI.subranges()) { - // Transform lanemask to a mask in the joined live interval. 
- LaneBitmask SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask); - if ((SMask & LaneMask).none()) - continue; - LiveQueryResult LRQ = S.Query(Def); - if (!ValueIn) { - ValueIn = LRQ.valueIn(); - continue; - } - if (LRQ.valueIn() && ValueIn != LRQ.valueIn()) - return std::make_pair(VNI, TrackReg); - } - } + LiveQueryResult LRQ = LI.Query(Def); + ValueIn = LRQ.valueIn(); if (ValueIn == nullptr) { // Reaching an undefined value is legitimate, for example: // @@ -2487,13 +2479,13 @@ const MachineInstr *DefMI = nullptr; if (VNI->isPHIDef()) { // Conservatively assume that all lanes in a PHI are valid. - LaneBitmask Lanes = SubRangeJoin ? LaneBitmask::getLane(0) - : TRI->getSubRegIndexLaneMask(SubIdx); + LaneBitmask Lanes = isSubRangeJoin() ? LaneBitmask::getLane(0) + : TRI->getSubRegIndexLaneMask(SubIdx); V.ValidLanes = V.WriteLanes = Lanes; } else { DefMI = Indexes->getInstructionFromIndex(VNI->def); assert(DefMI != nullptr); - if (SubRangeJoin) { + if (isSubRangeJoin()) { // We don't care about the lanes when joining subregister ranges. V.WriteLanes = V.ValidLanes = LaneBitmask::getLane(0); if (DefMI->isImplicitDef()) { @@ -2640,10 +2632,21 @@ // %other = COPY %ext // %this = COPY %ext <-- Erase this copy // - if (DefMI->isFullCopy() && !CP.isPartial() && - valuesIdentical(VNI, V.OtherVNI, Other)) { - V.Identical = true; - return CR_Erase; + // In a main range, detect this by calling valuesIdentical. In a subrange, + // inherit this identical CR_Erase property from the main range. 
+ // + if (DefMI->isFullCopy() && !CP.isPartial()) { + if (!isSubRangeJoin()) { + if (valuesIdentical(VNI, V.OtherVNI, Other)) { + IdenticalErases.insert(VNI->def); + V.Identical = true; + return CR_Erase; + } + } else if (MainVals->IdenticalErases.find(VNI->def) + != MainVals->IdenticalErases.end()) { + V.Identical = true; + return CR_Erase; + } } // If the lanes written by this instruction were all undef in OtherVNI, it is @@ -2816,7 +2819,7 @@ continue; LLVM_DEBUG(dbgs() << "\t\tconflict at " << printReg(Reg) << ':' << i << '@' << LR.getValNumInfo(i)->def << '\n'); - if (SubRangeJoin) + if (isSubRangeJoin()) return false; ++NumLaneConflicts; @@ -3178,12 +3181,14 @@ void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, LaneBitmask LaneMask, - const CoalescerPair &CP) { + const CoalescerPair &CP, + const JoinVals &LHSMainVals, + const JoinVals &RHSMainVals) { SmallVector NewVNInfo; JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask, - NewVNInfo, CP, LIS, TRI, true, true); + NewVNInfo, CP, LIS, TRI, &RHSMainVals, true); JoinVals LHSVals(LRange, CP.getDstReg(), CP.getDstIdx(), LaneMask, - NewVNInfo, CP, LIS, TRI, true, true); + NewVNInfo, CP, LIS, TRI, &LHSMainVals, true); // Compute NewVNInfo and resolve conflicts (see also joinVirtRegs()) // We should be able to resolve all conflicts here as we could successfully do @@ -3242,16 +3247,20 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, LaneBitmask LaneMask, - CoalescerPair &CP) { + CoalescerPair &CP, + const JoinVals &LHSMainVals, + const JoinVals &RHSMainVals) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); LI.refineSubRanges(Allocator, LaneMask, - [this,&Allocator,&ToMerge,&CP](LiveInterval::SubRange &SR) { + [this,&Allocator,&ToMerge,&CP,&LHSMainVals,&RHSMainVals]( + LiveInterval::SubRange &SR) { if (SR.empty()) { SR.assign(ToMerge, Allocator); } else { // joinSubRegRange() destroys the merged range, so we need a copy. 
LiveRange RangeCopy(ToMerge, Allocator); - joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP); + joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP, + LHSMainVals, RHSMainVals); } }); } @@ -3262,9 +3271,9 @@ LiveInterval &LHS = LIS->getInterval(CP.getDstReg()); bool TrackSubRegLiveness = MRI->shouldTrackSubRegLiveness(*CP.getNewRC()); JoinVals RHSVals(RHS, CP.getSrcReg(), CP.getSrcIdx(), LaneBitmask::getNone(), - NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness); + NewVNInfo, CP, LIS, TRI, nullptr, TrackSubRegLiveness); JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), LaneBitmask::getNone(), - NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness); + NewVNInfo, CP, LIS, TRI, nullptr, TrackSubRegLiveness); LLVM_DEBUG(dbgs() << "\t\tRHS = " << RHS << "\n\t\tLHS = " << LHS << '\n'); @@ -3305,12 +3314,12 @@ if (!RHS.hasSubRanges()) { LaneBitmask Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask() : TRI->getSubRegIndexLaneMask(SrcIdx); - mergeSubRangeInto(LHS, RHS, Mask, CP); + mergeSubRangeInto(LHS, RHS, Mask, CP, LHSVals, RHSVals); } else { // Pair up subranges and merge. for (LiveInterval::SubRange &R : RHS.subranges()) { LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask); - mergeSubRangeInto(LHS, R, Mask, CP); + mergeSubRangeInto(LHS, R, Mask, CP, LHSVals, RHSVals); } } LLVM_DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n"); Index: test/CodeGen/AMDGPU/regcoal-followcopychain-bogus-subrange-comparison.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/regcoal-followcopychain-bogus-subrange-comparison.mir @@ -0,0 +1,154 @@ +# RUN: llc -mtriple=amdgcn--amdgcn -mcpu=gfx803 -run-pass simple-register-coalescing -o - %s | FileCheck --check-prefix=GCN %s +# +# Without the associated fix, this test triggers the "Couldn't join subrange" +# unreachable in debug mode. The check below verifies that the copy +# 800B %28:vreg_128 = COPY %11:vreg_128 +# gets coalesced. 
+# +# GCN: body: +# GCN-NOT: %28 + +--- +name: _amdgpu_cs_main +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +liveins: +fixedStack: +stack: +constants: +body: | + bb.0: + successors: %bb.1(0x80000000) + + %27:vgpr_32 = nnan arcp contract reassoc V_MAD_F32 0, undef %28:vgpr_32, 0, 0, 0, 0, 0, 0, implicit $exec + %30:vgpr_32 = V_MIN_F32_e32 1065353216, killed %27, implicit $exec + %31:sreg_64_xexec = V_CMP_NEQ_F32_e64 0, 1065353216, 0, killed %30, 0, implicit $exec + %32:vgpr_32 = V_MOV_B32_e32 2143289344, implicit $exec + %34:vgpr_32 = V_CNDMASK_B32_e64 0, killed %32, killed %31, implicit $exec + %35:sreg_64_xexec = V_CMP_LT_F32_e64 0, 0, 0, killed %34, 0, implicit $exec + %23:sreg_64 = S_MOV_B64 0 + %38:sreg_32_xm0 = S_MOV_B32 0 + %107:sreg_64 = COPY killed %23 + %108:vreg_128 = IMPLICIT_DEF + %109:vreg_128 = IMPLICIT_DEF + + bb.1: + successors: %bb.2(0x40000000), %bb.4(0x40000000) + + %85:vreg_128 = COPY killed %109 + %2:vreg_128 = COPY killed %108 + %1:sreg_64 = COPY killed %107 + %110:vreg_128 = COPY %85 + %4:sreg_64 = COPY $exec, implicit-def $exec + %112:sreg_64 = S_AND_B64 %4, %35, implicit-def dead $scc + $exec = S_MOV_B64_term killed %112 + SI_MASK_BRANCH %bb.4, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3(0x40000000), %bb.5(0x40000000) + + S_CBRANCH_SCC1 %bb.5, implicit undef $scc + S_BRANCH %bb.3 + + bb.3: + successors: %bb.5(0x80000000) + + undef %39.sub0:sreg_256 = COPY %38 + %39.sub1:sreg_256 = COPY %38 + %39.sub2:sreg_256 = COPY %38 + %39.sub3:sreg_256 = COPY %38 + %39.sub4:sreg_256 = COPY %38 + %39.sub5:sreg_256 = COPY %38 + %39.sub6:sreg_256 = COPY %38 + %39.sub7:sreg_256 = COPY %38 + dead %40:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V4 undef %41:vreg_128, killed %39, undef %42:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, addrspace 4) + S_BRANCH %bb.5 + + bb.4: + successors: %bb.6(0x80000000) 
+ + $exec = S_OR_B64 $exec, killed %4, implicit-def $scc + %89:vreg_128 = COPY killed %110 + S_BRANCH %bb.6 + + bb.5: + successors: %bb.4(0x80000000) + + %51:vgpr_32 = COPY killed %2.sub1 + %53:vgpr_32 = COPY killed %51 + %53:vgpr_32 = V_MAC_F32_e32 target-flags(amdgpu-gotprel32-lo) 0, undef %100.sub1:vreg_128, %53, implicit $exec + %88:vreg_128 = COPY killed %85 + %88.sub1:vreg_128 = COPY killed %53 + %110:vreg_128 = COPY killed %88 + S_BRANCH %bb.4 + + bb.6: + successors: %bb.7(0x04000000), %bb.1(0x7c000000) + + %95:vgpr_32 = V_ADD_I32_e32 1, %89.sub3, implicit-def dead $vcc, implicit $exec + %91:vreg_128 = COPY killed %89 + %91.sub3:vreg_128 = COPY %95 + %59:sreg_64 = V_CMP_LT_U32_e64 3, killed %95, implicit $exec + %11:vreg_128 = COPY %91 + %60:vgpr_32 = V_MOV_B32_e32 953267991, implicit $exec + %61:sreg_64_xexec = V_CMP_GT_F32_e64 0, %91.sub2, 0, %60, 0, implicit $exec + %62:sreg_64 = V_CMP_NGT_F32_e64 0, %91.sub2, 0, killed %60, 0, implicit $exec + %13:vgpr_32 = V_CNDMASK_B32_e64 0, -1, killed %61, implicit $exec + %63:sreg_64 = S_OR_B64 killed %59, killed %62, implicit-def dead $scc + %14:sreg_64 = S_AND_B64 $exec, killed %63, implicit-def $scc + %14:sreg_64 = S_OR_B64 %14, killed %1, implicit-def $scc + %107:sreg_64 = COPY %14 + %108:vreg_128 = COPY %11 + %109:vreg_128 = COPY %11 + $exec = S_ANDN2_B64_term $exec, %14 + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.7 + + bb.7: + successors: %bb.8(0x40000000), %bb.9(0x40000000) + + $exec = S_OR_B64 $exec, killed %14, implicit-def $scc + %64:sreg_64 = V_CMP_NE_U32_e64 0, killed %13, implicit $exec + %111:vreg_128 = COPY %91 + %16:sreg_64 = COPY $exec, implicit-def $exec + %113:sreg_64 = S_AND_B64 %16, %64, implicit-def dead $scc + $exec = S_MOV_B64_term killed %113 + SI_MASK_BRANCH %bb.9, implicit $exec + S_BRANCH %bb.8 + + bb.8: + successors: %bb.9(0x80000000) + + %65:vgpr_32 = COPY killed %11.sub1 + %67:vgpr_32 = V_MAD_F32 0, target-flags(amdgpu-gotprel32-hi) 0, 0, killed %91.sub2, 0, killed %65, 0, 
0, implicit $exec + undef %102.sub1:vreg_128 = COPY killed %67 + %93:vreg_128 = COPY killed %102 + %111:vreg_128 = COPY killed %93 + + bb.9: + $exec = S_OR_B64 $exec, killed %16, implicit-def $scc + %92:vreg_128 = COPY killed %111 + %77:vgpr_32 = V_MUL_F32_e32 target-flags(amdgpu-gotprel32-lo) 0, killed %92.sub1, implicit $exec + undef %106.sub0:vreg_128 = COPY %77 + %106.sub1:vreg_128 = COPY %77 + %106.sub2:vreg_128 = COPY %77 + %106.sub3:vreg_128 = COPY killed %77 + %79:sreg_32_xm0 = S_MOV_B32 0 + undef %80.sub0:sreg_256 = COPY %79 + %80.sub1:sreg_256 = COPY %79 + %80.sub2:sreg_256 = COPY %79 + %80.sub3:sreg_256 = COPY %79 + %80.sub4:sreg_256 = COPY %79 + %80.sub5:sreg_256 = COPY %79 + %80.sub6:sreg_256 = COPY %79 + %80.sub7:sreg_256 = COPY killed %79 + IMAGE_STORE_V4_V2 killed %106, undef %82:vreg_64, killed %80, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into constant-pool, addrspace 4) + S_ENDPGM + +... Index: test/CodeGen/AMDGPU/regcoal-followcopychain-different-subreg-diffs.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/regcoal-followcopychain-different-subreg-diffs.mir @@ -0,0 +1,134 @@ +# RUN: llc -mtriple=amdgcn--amdgcn -mcpu=gfx803 -run-pass simple-register-coalescing -o - %s | FileCheck --check-prefix=GCN %s +# +# Without the associated fix, this test triggers the "Couldn't join subrange" +# unreachable in debug mode. The check below verifies that the copy +# 480B %16:vreg_128 = COPY %5:vreg_128 +# gets coalesced. 
+# +# GCN: body: +# GCN-NOT: %5 + +--- +name: _amdgpu_ps_main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0', virtual-reg: '%16' } +body: | + bb.0: + successors: %bb.3, %bb.1 + liveins: $sgpr0 + + %16:sgpr_32 = COPY killed $sgpr0 + $m0 = S_MOV_B32 killed %16 + %19:vgpr_32 = V_INTERP_P2_F32 undef %19, undef %21:vgpr_32, 0, 1, implicit $m0, implicit $exec + %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %23:vgpr_32 = V_MAD_F32 0, killed %19, 0, 0, 0, 0, 0, 0, implicit $exec + undef %75.sub0:vreg_128 = COPY %24 + %75.sub2:vreg_128 = COPY %24 + %26:vgpr_32 = nnan arcp contract reassoc V_MUL_F32_e64 0, 0, 0, killed %23, 1, 0, implicit $exec + %0:vgpr_32 = V_MUL_F32_e32 0, killed %26, implicit $exec + %28:sreg_64 = V_CMP_NLT_F32_e64 0, 0, 0, killed %0, 0, implicit $exec + %89:vreg_128 = IMPLICIT_DEF + %91:sreg_64 = COPY $exec, implicit-def $exec + %92:sreg_64 = S_AND_B64 %91, %28, implicit-def dead $scc + %2:sreg_64 = S_XOR_B64 %92, %91, implicit-def dead $scc + $exec = S_MOV_B64_term killed %92 + SI_MASK_BRANCH %bb.1, implicit $exec + S_BRANCH %bb.3 + + bb.1: + successors: %bb.2, %bb.4 + + %93:sreg_64 = COPY killed %2 + %5:sreg_64 = S_OR_SAVEEXEC_B64 %93, implicit-def $exec, implicit-def $scc, implicit $exec + %3:vreg_128 = COPY killed %89 + %90:vreg_128 = COPY killed %3 + $exec = S_XOR_B64_term $exec, %5, implicit-def $scc + SI_MASK_BRANCH %bb.4, implicit $exec + S_BRANCH %bb.2 + + bb.2: + dead %37:sreg_32 = S_MOV_B32 0 + %6:vreg_128 = COPY killed %75 + %6.sub3:vreg_128 = COPY undef %37 + %90:vreg_128 = COPY killed %6 + S_BRANCH %bb.4 + + bb.3: + %1:vreg_128 = COPY %75 + %89:vreg_128 = COPY killed %1 + S_BRANCH %bb.1 + + bb.4: + $exec = S_OR_B64 $exec, killed %5, implicit-def $scc + %8:vreg_128 = COPY killed %90 + S_CBRANCH_SCC1 %bb.10, implicit undef $scc + S_BRANCH %bb.5 + + bb.5: + successors: %bb.6, %bb.7 + + S_CBRANCH_SCC1 %bb.7, implicit undef $scc + S_BRANCH %bb.6 + + bb.6: + %43:vgpr_32 = V_MAD_F32 0, %8.sub0, 0, target-flags(amdgpu-gotprel) 0, 0, 0, 
0, 0, implicit $exec + %44:vgpr_32 = COPY killed %43 + %44:vgpr_32 = V_MAC_F32_e32 target-flags(amdgpu-gotprel32-hi) 0, killed %8.sub2, %44, implicit $exec + %45:vgpr_32 = V_ADD_F32_e32 0, killed %44, implicit $exec + %47:vgpr_32 = V_MOV_B32_e32 1107296256, implicit $exec + %48:vgpr_32 = nnan arcp contract reassoc V_MAD_F32 0, killed %45, 0, killed %47, 0, 1056964608, 0, 0, implicit $exec + %49:vgpr_32 = V_FLOOR_F32_e32 killed %48, implicit $exec + %50:vgpr_32 = V_CVT_I32_F32_e32 killed %49, implicit $exec + %81:vgpr_32 = V_ADD_I32_e32 1, killed %50, implicit-def dead $vcc, implicit $exec + dead %82:vgpr_32 = V_MIN_I32_e32 31, killed %81, implicit $exec + + bb.7: + successors: %bb.8, %bb.9 + + S_CBRANCH_SCC1 %bb.9, implicit undef $scc + S_BRANCH %bb.8 + + bb.8: + + bb.9: + + bb.10: + S_CBRANCH_SCC1 %bb.17, implicit undef $scc + S_BRANCH %bb.11 + + bb.11: + + bb.12: + successors: %bb.13, %bb.15 + + S_CBRANCH_SCC1 %bb.15, implicit undef $scc + S_BRANCH %bb.13 + + bb.13: + successors: %bb.14, %bb.15 + + S_CBRANCH_SCC1 %bb.15, implicit undef $scc + S_BRANCH %bb.14 + + bb.14: + + bb.15: + successors: %bb.16, %bb.12 + + S_CBRANCH_SCC1 %bb.12, implicit undef $scc + S_BRANCH %bb.16 + + bb.16: + successors: %bb.12(0x7c000000), %bb.17(0x04000000) + + dead %86:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %74:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc + $vcc = COPY killed %74 + S_CBRANCH_VCCNZ %bb.12, implicit killed $vcc + S_BRANCH %bb.17 + + bb.17: + S_ENDPGM + +...