Index: llvm/trunk/include/llvm/CodeGen/CalcSpillWeights.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/CalcSpillWeights.h
+++ llvm/trunk/include/llvm/CodeGen/CalcSpillWeights.h
@@ -20,6 +20,7 @@
   class LiveIntervals;
   class MachineBlockFrequencyInfo;
   class MachineLoopInfo;
+  class VirtRegMap;
 
   /// \brief Normalize the spill weight of a live interval
   ///
@@ -51,6 +52,7 @@
   private:
     MachineFunction &MF;
     LiveIntervals &LIS;
+    VirtRegMap *VRM;
     const MachineLoopInfo &Loops;
     const MachineBlockFrequencyInfo &MBFI;
     DenseMap<unsigned, float> Hint;
@@ -58,10 +60,10 @@
   public:
     VirtRegAuxInfo(MachineFunction &mf, LiveIntervals &lis,
-                   const MachineLoopInfo &loops,
+                   VirtRegMap *vrm, const MachineLoopInfo &loops,
                    const MachineBlockFrequencyInfo &mbfi,
                    NormalizingFn norm = normalizeSpillWeight)
-        : MF(mf), LIS(lis), Loops(loops), MBFI(mbfi), normalize(norm) {}
+        : MF(mf), LIS(lis), VRM(vrm), Loops(loops), MBFI(mbfi), normalize(norm) {}
 
     /// \brief (re)compute li's spill weight and allocation hint.
     void calculateSpillWeightAndHint(LiveInterval &li);
@@ -70,6 +72,7 @@
   /// \brief Compute spill weights and allocation hints for all virtual register
   /// live intervals.
   void calculateSpillWeightsAndHints(LiveIntervals &LIS, MachineFunction &MF,
+                                     VirtRegMap *VRM,
                                      const MachineLoopInfo &MLI,
                                      const MachineBlockFrequencyInfo &MBFI,
                                      VirtRegAuxInfo::NormalizingFn norm =
Index: llvm/trunk/lib/CodeGen/CalcSpillWeights.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CalcSpillWeights.cpp
+++ llvm/trunk/lib/CodeGen/CalcSpillWeights.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -24,6 +25,7 @@
 
 void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS,
                                          MachineFunction &MF,
+                                         VirtRegMap *VRM,
                                          const MachineLoopInfo &MLI,
                                          const MachineBlockFrequencyInfo &MBFI,
                                          VirtRegAuxInfo::NormalizingFn norm) {
@@ -31,7 +33,7 @@
                << "********** Function: " << MF.getName() << '\n');
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  VirtRegAuxInfo VRAI(MF, LIS, MLI, MBFI, norm);
+  VirtRegAuxInfo VRAI(MF, LIS, VRM, MLI, MBFI, norm);
   for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (MRI.reg_nodbg_empty(Reg))
@@ -74,7 +76,10 @@
 // Check if all values in LI are rematerializable
 static bool isRematerializable(const LiveInterval &LI,
                                const LiveIntervals &LIS,
+                               VirtRegMap *VRM,
                                const TargetInstrInfo &TII) {
+  unsigned Reg = LI.reg;
+  unsigned Original = VRM ? VRM->getOriginal(Reg) : 0;
   for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
        I != E; ++I) {
     const VNInfo *VNI = *I;
@@ -86,6 +91,36 @@
     MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
     assert(MI && "Dead valno in interval");
 
+    // Trace copies introduced by live range splitting. The inline
+    // spiller can rematerialize through these copies, so the spill
+    // weight must reflect this.
+    if (VRM) {
+      while (MI->isFullCopy()) {
+        // The copy destination must match the interval register.
+        if (MI->getOperand(0).getReg() != Reg)
+          return false;
+
+        // Get the source register.
+        Reg = MI->getOperand(1).getReg();
+
+        // If the original (pre-splitting) registers match, this
+        // copy came from a split.
+        if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+            VRM->getOriginal(Reg) != Original)
+          return false;
+
+        // Follow the copy live-in value.
+        const LiveInterval &SrcLI = LIS.getInterval(Reg);
+        LiveQueryResult SrcQ = SrcLI.Query(VNI->def);
+        VNI = SrcQ.valueIn();
+        assert(VNI && "Copy from non-existing value");
+        if (VNI->isPHIDef())
+          return false;
+        MI = LIS.getInstructionFromIndex(VNI->def);
+        assert(MI && "Dead valno in interval");
+      }
+    }
+
     if (!TII.isTriviallyReMaterializable(MI, LIS.getAliasAnalysis()))
       return false;
   }
@@ -188,7 +223,7 @@
   // it is a preferred candidate for spilling.
   // FIXME: this gets much more complicated once we support non-trivial
   // re-materialization.
-  if (isRematerializable(li, LIS, *MF.getSubtarget().getInstrInfo()))
+  if (isRematerializable(li, LIS, VRM, *MF.getSubtarget().getInstrInfo()))
     totalWeight *= 0.5F;
 
   li.weight = normalize(totalWeight, li.getSize(), numInstr);
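The tracing loop added above is easier to see in isolation. The following standalone sketch captures the same idea; it is not LLVM code, and the Def, DefOf, and OrigOf names are invented for illustration. A full copy is chased back to its source for as long as source and destination descend from the same pre-splitting register, and the value at the head of the chain must itself be rematerializable.

    // Minimal sketch of the copy-tracing idea in isRematerializable.
    // All names here are invented; OrigOf plays the role of
    // VirtRegMap::getOriginal.
    #include <map>

    struct Def {
      unsigned CopySrc = 0; // non-zero if this value is a full copy of CopySrc
      bool Remat = false;   // true if the defining instr is trivially remat
    };

    // DefOf maps each vreg to its single definition (SSA-like); OrigOf maps
    // each vreg to the original (pre-splitting) register it descends from.
    bool rematThroughSplitCopies(unsigned Reg,
                                 const std::map<unsigned, Def> &DefOf,
                                 const std::map<unsigned, unsigned> &OrigOf) {
      const unsigned Original = OrigOf.at(Reg);
      Def D = DefOf.at(Reg);
      // Chase full copies, but only while source and destination belong to
      // the same original register: such copies were introduced by live
      // range splitting, and the spiller can rematerialize through them.
      while (D.CopySrc != 0) {
        Reg = D.CopySrc;
        auto It = OrigOf.find(Reg);
        if (It == OrigOf.end() || It->second != Original)
          return false; // the copy crosses out of the split family: give up
        D = DefOf.at(Reg);
      }
      // The value that started the chain must itself be rematerializable.
      return D.Remat;
    }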
Index: llvm/trunk/lib/CodeGen/LiveRangeEdit.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/LiveRangeEdit.cpp
+++ llvm/trunk/lib/CodeGen/LiveRangeEdit.cpp
@@ -411,7 +411,7 @@
 LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
                                         const MachineLoopInfo &Loops,
                                         const MachineBlockFrequencyInfo &MBFI) {
-  VirtRegAuxInfo VRAI(MF, LIS, Loops, MBFI);
+  VirtRegAuxInfo VRAI(MF, LIS, VRM, Loops, MBFI);
   for (unsigned I = 0, Size = size(); I < Size; ++I) {
     LiveInterval &LI = LIS.getInterval(get(I));
     if (MRI.recomputeRegClass(LI.reg))
Index: llvm/trunk/lib/CodeGen/RegAllocBasic.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/RegAllocBasic.cpp
+++ llvm/trunk/lib/CodeGen/RegAllocBasic.cpp
@@ -276,7 +276,7 @@
                      getAnalysis<LiveIntervals>(),
                      getAnalysis<LiveRegMatrix>());
 
-  calculateSpillWeightsAndHints(*LIS, *MF,
+  calculateSpillWeightsAndHints(*LIS, *MF, VRM,
                                 getAnalysis<MachineLoopInfo>(),
                                 getAnalysis<MachineBlockFrequencyInfo>());
Index: llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp
+++ llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp
@@ -2586,7 +2586,7 @@
 
   initializeCSRCost();
 
-  calculateSpillWeightsAndHints(*LIS, mf, *Loops, *MBFI);
+  calculateSpillWeightsAndHints(*LIS, mf, VRM, *Loops, *MBFI);
 
   DEBUG(LIS->dump());
Index: llvm/trunk/lib/CodeGen/RegAllocPBQP.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/RegAllocPBQP.cpp
+++ llvm/trunk/lib/CodeGen/RegAllocPBQP.cpp
@@ -724,11 +724,11 @@
   MachineBlockFrequencyInfo &MBFI =
     getAnalysis<MachineBlockFrequencyInfo>();
 
-  calculateSpillWeightsAndHints(LIS, MF, getAnalysis<MachineLoopInfo>(), MBFI,
-                                normalizePBQPSpillWeight);
-
   VirtRegMap &VRM = getAnalysis<VirtRegMap>();
 
+  calculateSpillWeightsAndHints(LIS, MF, &VRM, getAnalysis<MachineLoopInfo>(),
+                                MBFI, normalizePBQPSpillWeight);
+
   std::unique_ptr<Spiller> VRegSpiller(createInlineSpiller(*this, MF, VRM));
 
   MF.getRegInfo().freezeReservedRegs(MF);
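All four allocators now thread their VirtRegMap into the spill weight computation; the PBQP allocator additionally hoists the VRM lookup above the call so a pointer to it is available. The only thing the tracing needs from VirtRegMap is its split bookkeeping. The class below is a simplified, illustrative stand-in for that contract, not the real VirtRegMap, though the member names mirror its getOriginal/setIsSplitFromReg interface:

    // Simplified sketch of the bookkeeping the tracing relies on: every
    // vreg created by live range splitting records the pre-splitting
    // register it descends from.
    #include <map>

    class SplitOriginMap {
      std::map<unsigned, unsigned> Virt2Split; // split vreg -> original vreg
    public:
      // Called by the splitter when NewReg is carved out of a range whose
      // register is OrigReg.
      void setIsSplitFromReg(unsigned NewReg, unsigned OrigReg) {
        Virt2Split[NewReg] = getOriginal(OrigReg); // collapse split chains
      }
      // Registers that were never split are their own original.
      unsigned getOriginal(unsigned Reg) const {
        auto It = Virt2Split.find(Reg);
        return It == Virt2Split.end() ? Reg : It->second;
      }
    };

Two virtual registers belong to the same split family exactly when getOriginal returns the same register for both, which is the test the new while loop applies to every copy source.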
Index: llvm/trunk/test/CodeGen/X86/pr24139.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr24139.ll
+++ llvm/trunk/test/CodeGen/X86/pr24139.ll
@@ -0,0 +1,148 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+
+; Check that we do not get excessive spilling from splitting of constant live ranges.
+
+; CHECK-LABEL: PR24139:
+; CHECK: # 16-byte Spill
+; CHECK-NOT: # 16-byte Spill
+; CHECK: retq
+
+define <2 x double> @PR24139(<2 x double> %arg, <2 x double> %arg1, <2 x double> %arg2) {
+  %tmp = bitcast <2 x double> %arg to <4 x float>
+  %tmp3 = fmul <4 x float> %tmp,
+  %tmp4 = bitcast <2 x double> %arg to <4 x i32>
+  %tmp5 = and <4 x i32> %tmp4,
+  %tmp6 = or <4 x i32> %tmp5,
+  %tmp7 = bitcast <4 x i32> %tmp6 to <4 x float>
+  %tmp8 = fadd <4 x float> %tmp3, %tmp7
+  %tmp9 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp8) #2
+  %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
+  %tmp11 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp9) #2
+  %tmp12 = fmul <4 x float> %tmp11,
+  %tmp13 = fsub <4 x float> %tmp, %tmp12
+  %tmp14 = fmul <4 x float> %tmp11,
+  %tmp15 = fsub <4 x float> %tmp13, %tmp14
+  %tmp16 = fmul <4 x float> %tmp15, %tmp15
+  %tmp17 = fmul <4 x float> %tmp15, %tmp16
+  %tmp18 = fmul <4 x float> %tmp16,
+  %tmp19 = fadd <4 x float> %tmp18,
+  %tmp20 = fmul <4 x float> %tmp16,
+  %tmp21 = fadd <4 x float> %tmp20,
+  %tmp22 = fmul <4 x float> %tmp16, %tmp19
+  %tmp23 = fadd <4 x float> %tmp22,
+  %tmp24 = fmul <4 x float> %tmp16, %tmp21
+  %tmp25 = fadd <4 x float> %tmp24,
+  %tmp26 = fmul <4 x float> %tmp16, %tmp23
+  %tmp27 = fadd <4 x float> %tmp26,
+  %tmp28 = fmul <4 x float> %tmp17, %tmp25
+  %tmp29 = fadd <4 x float> %tmp15, %tmp28
+  %tmp30 = and <2 x i64> %tmp10,
+  %tmp31 = bitcast <2 x i64> %tmp30 to <4 x i32>
+  %tmp32 = icmp eq <4 x i32> %tmp31, zeroinitializer
+  %tmp33 = sext <4 x i1> %tmp32 to <4 x i32>
+  %tmp34 = bitcast <4 x i32> %tmp33 to <4 x float>
+  %tmp35 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp27, <4 x float> %tmp29, <4 x float> %tmp34) #2
+  %tmp36 = and <2 x i64> %tmp10,
+  %tmp37 = bitcast <2 x i64> %tmp36 to <4 x i32>
+  %tmp38 = icmp eq <4 x i32> %tmp37, zeroinitializer
+  %tmp39 = sext <4 x i1> %tmp38 to <4 x i32>
+  %tmp40 = bitcast <4 x float> %tmp35 to <4 x i32>
+  %tmp41 = xor <4 x i32> %tmp40,
+  %tmp42 = bitcast <4 x i32> %tmp41 to <4 x float>
+  %tmp43 = bitcast <4 x i32> %tmp39 to <4 x float>
+  %tmp44 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp42, <4 x float> %tmp35, <4 x float> %tmp43) #2
+  %tmp45 = bitcast <2 x double> %arg1 to <4 x float>
+  %tmp46 = fmul <4 x float> %tmp45,
+  %tmp47 = bitcast <2 x double> %arg1 to <4 x i32>
+  %tmp48 = and <4 x i32> %tmp47,
+  %tmp49 = or <4 x i32> %tmp48,
+  %tmp50 = bitcast <4 x i32> %tmp49 to <4 x float>
+  %tmp51 = fadd <4 x float> %tmp46, %tmp50
+  %tmp52 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp51) #2
+  %tmp53 = bitcast <4 x i32> %tmp52 to <2 x i64>
+  %tmp54 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp52) #2
+  %tmp55 = fmul <4 x float> %tmp54,
+  %tmp56 = fsub <4 x float> %tmp45, %tmp55
+  %tmp57 = fmul <4 x float> %tmp54,
+  %tmp58 = fsub <4 x float> %tmp56, %tmp57
+  %tmp59 = fmul <4 x float> %tmp58, %tmp58
+  %tmp60 = fmul <4 x float> %tmp58, %tmp59
+  %tmp61 = fmul <4 x float> %tmp59,
+  %tmp62 = fadd <4 x float> %tmp61,
+  %tmp63 = fmul <4 x float> %tmp59,
+  %tmp64 = fadd <4 x float> %tmp63,
+  %tmp65 = fmul <4 x float> %tmp59, %tmp62
+  %tmp66 = fadd <4 x float> %tmp65,
+  %tmp67 = fmul <4 x float> %tmp59, %tmp64
+  %tmp68 = fadd <4 x float> %tmp67,
+  %tmp69 = fmul <4 x float> %tmp59, %tmp66
+  %tmp70 = fadd <4 x float> %tmp69,
+  %tmp71 = fmul <4 x float> %tmp60, %tmp68
+  %tmp72 = fadd <4 x float> %tmp58, %tmp71
+  %tmp73 = and <2 x i64> %tmp53,
+  %tmp74 = bitcast <2 x i64> %tmp73 to <4 x i32>
+  %tmp75 = icmp eq <4 x i32> %tmp74, zeroinitializer
+  %tmp76 = sext <4 x i1> %tmp75 to <4 x i32>
+  %tmp77 = bitcast <4 x i32> %tmp76 to <4 x float>
+  %tmp78 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp70, <4 x float> %tmp72, <4 x float> %tmp77) #2
+  %tmp79 = and <2 x i64> %tmp53,
+  %tmp80 = bitcast <2 x i64> %tmp79 to <4 x i32>
+  %tmp81 = icmp eq <4 x i32> %tmp80, zeroinitializer
+  %tmp82 = sext <4 x i1> %tmp81 to <4 x i32>
+  %tmp83 = bitcast <4 x float> %tmp78 to <4 x i32>
+  %tmp84 = xor <4 x i32> %tmp83,
+  %tmp85 = bitcast <4 x i32> %tmp84 to <4 x float>
+  %tmp86 = bitcast <4 x i32> %tmp82 to <4 x float>
+  %tmp87 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp85, <4 x float> %tmp78, <4 x float> %tmp86) #2
+  %tmp88 = fadd <4 x float> %tmp44, %tmp87
+  %tmp89 = bitcast <2 x double> %arg2 to <4 x float>
+  %tmp90 = fmul <4 x float> %tmp89,
+  %tmp91 = bitcast <2 x double> %arg2 to <4 x i32>
+  %tmp92 = and <4 x i32> %tmp91,
+  %tmp93 = or <4 x i32> %tmp92,
+  %tmp94 = bitcast <4 x i32> %tmp93 to <4 x float>
+  %tmp95 = fadd <4 x float> %tmp90, %tmp94
+  %tmp96 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp95) #2
+  %tmp97 = bitcast <4 x i32> %tmp96 to <2 x i64>
+  %tmp98 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp96) #2
+  %tmp99 = fmul <4 x float> %tmp98,
+  %tmp100 = fsub <4 x float> %tmp89, %tmp99
+  %tmp101 = fmul <4 x float> %tmp98,
+  %tmp102 = fsub <4 x float> %tmp100, %tmp101
+  %tmp103 = fmul <4 x float> %tmp102, %tmp102
+  %tmp104 = fmul <4 x float> %tmp102, %tmp103
+  %tmp105 = fmul <4 x float> %tmp103,
+  %tmp106 = fadd <4 x float> %tmp105,
+  %tmp107 = fmul <4 x float> %tmp103,
+  %tmp108 = fadd <4 x float> %tmp107,
+  %tmp109 = fmul <4 x float> %tmp103, %tmp106
+  %tmp110 = fadd <4 x float> %tmp109,
+  %tmp111 = fmul <4 x float> %tmp103, %tmp108
+  %tmp112 = fadd <4 x float> %tmp111,
+  %tmp113 = fmul <4 x float> %tmp103, %tmp110
+  %tmp114 = fadd <4 x float> %tmp113,
+  %tmp115 = fmul <4 x float> %tmp104, %tmp112
+  %tmp116 = fadd <4 x float> %tmp102, %tmp115
+  %tmp117 = and <2 x i64> %tmp97,
+  %tmp118 = bitcast <2 x i64> %tmp117 to <4 x i32>
+  %tmp119 = icmp eq <4 x i32> %tmp118, zeroinitializer
+  %tmp120 = sext <4 x i1> %tmp119 to <4 x i32>
+  %tmp121 = bitcast <4 x i32> %tmp120 to <4 x float>
+  %tmp122 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp114, <4 x float> %tmp116, <4 x float> %tmp121) #2
+  %tmp123 = and <2 x i64> %tmp97,
+  %tmp124 = bitcast <2 x i64> %tmp123 to <4 x i32>
+  %tmp125 = icmp eq <4 x i32> %tmp124, zeroinitializer
+  %tmp126 = sext <4 x i1> %tmp125 to <4 x i32>
+  %tmp127 = bitcast <4 x float> %tmp122 to <4 x i32>
+  %tmp128 = xor <4 x i32> %tmp127,
+  %tmp129 = bitcast <4 x i32> %tmp128 to <4 x float>
+  %tmp130 = bitcast <4 x i32> %tmp126 to <4 x float>
+  %tmp131 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp129, <4 x float> %tmp122, <4 x float> %tmp130) #2
+  %tmp132 = fadd <4 x float> %tmp88, %tmp131
+  %tmp133 = bitcast <4 x float> %tmp132 to <2 x double>
+  ret <2 x double> %tmp133
+}
+
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
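Taken together, the CHECK lines assert exactly one 16-byte spill in PR24139: the first CHECK requires a spill to appear, and the CHECK-NOT forbids any further 16-byte spill between that match and the final retq. Without the copy tracing added above, splitting the constant live ranges in this function hides their rematerializability from the spill weight calculation, and several of them end up spilled instead.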