Index: lib/CodeGen/RegisterScavenging.cpp
===================================================================
--- lib/CodeGen/RegisterScavenging.cpp
+++ lib/CodeGen/RegisterScavenging.cpp
@@ -392,11 +392,35 @@
     return SReg;
   }
 
-  // Find an available scavenging slot.
-  unsigned SI;
-  for (SI = 0; SI < Scavenged.size(); ++SI)
-    if (Scavenged[SI].Reg == 0)
-      break;
+  // Find an available scavenging slot with size and alignment matching
+  // the requirements of the class RC.
+  MachineFunction &MF = *I->getParent()->getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned NeedSize = RC->getSize();
+  unsigned NeedAlign = RC->getAlignment();
+
+  unsigned SI = Scavenged.size(), Diff = UINT_MAX;
+  for (unsigned I = 0; I < Scavenged.size(); ++I) {
+    if (Scavenged[I].Reg != 0)
+      continue;
+    // Verify that this slot is valid for this register.
+    int FI = Scavenged[I].FrameIndex;
+    unsigned S = MFI.getObjectSize(FI);
+    unsigned A = MFI.getObjectAlignment(FI);
+    if (NeedSize > S || NeedAlign > A)
+      continue;
+    // Avoid wasting slots with large size and/or large alignment. Pick one
+    // that is the best fit for this register class (in street metric).
+    // Picking a larger slot than necessary could happen if a slot for a
+    // larger register is reserved before a slot for a smaller one. When
+    // trying to spill a smaller register, the large slot would be found
+    // first, thus making it impossible to spill the larger register later.
+    unsigned D = (S - NeedSize) + (A - NeedAlign);
+    if (D < Diff) {
+      SI = I;
+      Diff = D;
+    }
+  }
 
   if (SI == Scavenged.size()) {
     // We need to scavenge a register but have no spill slot, the target
@@ -411,8 +435,13 @@
   // otherwise, use the emergency stack spill slot.
   if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) {
     // Spill the scavenged register before I.
-    assert(Scavenged[SI].FrameIndex >= 0 &&
-           "Cannot scavenge register without an emergency spill slot!");
+    if (Scavenged[SI].FrameIndex < 0) {
+      // A Twine must not be stored in a local, so build a std::string.
+      std::string Msg = std::string("Error while trying to spill ") +
+          TRI->getName(SReg) + " from class " + TRI->getRegClassName(RC) +
+          ": Cannot scavenge register without an emergency spill slot!";
+      report_fatal_error(Msg.c_str());
+    }
     TII->storeRegToStackSlot(*MBB, I, SReg, true,
                              Scavenged[SI].FrameIndex, RC, TRI);
     MachineBasicBlock::iterator II = std::prev(I);
Index: test/CodeGen/Hexagon/reg-scavenger-valid-slot.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Hexagon/reg-scavenger-valid-slot.ll
@@ -0,0 +1,138 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: asserts
+
+; This testcase tries to force spills of both vector and int registers
+; in a function where scavenging slots were reserved for both register
+; classes. The original problem was that the scavenger selected an int
+; slot (with size/alignment of 4) for a vector register (with size/
+; alignment of 64). This caused an assertion in the assembly printer
+; due to an offset in a vector store having unexpected low-order bits.
+
+; We cannot check directly whether the offending bits appear, since they
+; will have been truncated off by the time we see the output, but we can
+; check that we got to the end of the function without crashing.
+ +; CHECK: endloop +; CHECK: dealloc_return + +target triple = "hexagon" + +define i32 @foo(i32 %n, i32* readonly %pi, i32* nocapture %po, <16 x i32>* readonly %vi, <16 x i32>* nocapture %vo) #0 { +entry: + br i1 undef, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %incdec.ptr63 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi, i32 32 + %incdec.ptr61 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi, i32 30 + %incdec.ptr60 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi, i32 29 + %tmp = load <16 x i32>, <16 x i32>* %incdec.ptr60, align 64 + %incdec.ptr59 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi, i32 28 + %tmp1 = load <16 x i32>, <16 x i32>* %incdec.ptr59, align 64 + %incdec.ptr58 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi, i32 27 + %tmp2 = load <16 x i32>, <16 x i32>* %incdec.ptr58, align 64 + %incdec.ptr57 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi, i32 26 + %tmp3 = load <16 x i32>, <16 x i32>* %incdec.ptr57, align 64 + %incdec.ptr56 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi, i32 25 + %tmp4 = load <16 x i32>, <16 x i32>* %incdec.ptr56, align 64 + %tmp5 = load <16 x i32>, <16 x i32>* undef, align 64 + %incdec.ptr40 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi, i32 9 + %tmp6 = load <16 x i32>, <16 x i32>* %incdec.ptr40, align 64 + br i1 undef, label %for.end.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader.new + %po.addr.0289 = phi i32* [ %po, %for.body.preheader.new ], [ %incdec.ptr64.7, %for.body ] + %vi.addr.0288 = phi <16 x i32>* [ %incdec.ptr63, %for.body.preheader.new ], [ %incdec.ptr67.7, %for.body ] + %vo.addr.0287 = phi <16 x i32>* [ %vo, %for.body.preheader.new ], [ %incdec.ptr66.7, %for.body ] + %v31.0.in286 = phi <16 x i32>* [ undef, %for.body.preheader.new ], [ %incdec.ptr67.6, %for.body ] + %v30.0285 = phi <16 x i32> [ zeroinitializer, %for.body.preheader.new ], [ %v31.0.7, %for.body ] + %v29.0284 = phi <16 x i32> [ %tmp, %for.body.preheader.new ], [ %v31.0.6, %for.body ] + %v28.0283 = phi <16 x i32> [ %tmp1, %for.body.preheader.new ], [ undef, %for.body ] + %v27.0282 = phi <16 x i32> [ %tmp2, %for.body.preheader.new ], [ %v31.0.4, %for.body ] + %v26.0281 = phi <16 x i32> [ %tmp3, %for.body.preheader.new ], [ %v31.0.3, %for.body ] + %v25.0280 = phi <16 x i32> [ %tmp4, %for.body.preheader.new ], [ undef, %for.body ] + %v23.0278 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v31.0, %for.body ] + %v22.0277 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v30.0285, %for.body ] + %v21.0276 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v29.0284, %for.body ] + %v20.0275 = phi <16 x i32> [ %tmp5, %for.body.preheader.new ], [ %v28.0283, %for.body ] + %v19.0274 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v27.0282, %for.body ] + %v18.0273 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v26.0281, %for.body ] + %v17.0272 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v25.0280, %for.body ] + %v15.0270 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v23.0278, %for.body ] + %v14.0269 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v22.0277, %for.body ] + %v13.0268 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v21.0276, %for.body ] + %v12.0267 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v20.0275, %for.body ] + %v11.0266 = phi <16 x i32> [ undef, %for.body.preheader.new ], 
[ %v19.0274, %for.body ] + %v10.0265 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v18.0273, %for.body ] + %v9.0264 = phi <16 x i32> [ %tmp6, %for.body.preheader.new ], [ %v17.0272, %for.body ] + %v7.0262 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v15.0270, %for.body ] + %v6.0261 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v14.0269, %for.body ] + %v5.0260 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v13.0268, %for.body ] + %v4.0259 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v12.0267, %for.body ] + %v3.0258 = phi <16 x i32> [ undef, %for.body.preheader.new ], [ %v11.0266, %for.body ] + %niter = phi i32 [ undef, %for.body.preheader.new ], [ %niter.nsub.7, %for.body ] + %v31.0 = load <16 x i32>, <16 x i32>* %v31.0.in286, align 64 + %incdec.ptr66.1 = getelementptr inbounds <16 x i32>, <16 x i32>* %vo.addr.0287, i32 2 + store <16 x i32> undef, <16 x i32>* %incdec.ptr66.1, align 64 + %incdec.ptr67.2 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi.addr.0288, i32 3 + %v31.0.3 = load <16 x i32>, <16 x i32>* undef, align 64 + %incdec.ptr64.3 = getelementptr inbounds i32, i32* %po.addr.0289, i32 4 + %tmp7 = tail call <512 x i1> @llvm.hexagon.V6.vgtw(<16 x i32> undef, <16 x i32> %v31.0) + %tmp8 = tail call <512 x i1> @llvm.hexagon.V6.vgtw.xor(<512 x i1> %tmp7, <16 x i32> %v15.0270, <16 x i32> %v31.0) + %tmp9 = tail call <512 x i1> @llvm.hexagon.V6.vgtw.xor(<512 x i1> %tmp8, <16 x i32> %v23.0278, <16 x i32> undef) + %tmp10 = tail call <512 x i1> @llvm.hexagon.V6.vgtw.xor(<512 x i1> %tmp9, <16 x i32> %v4.0259, <16 x i32> %v3.0258) + %tmp11 = tail call <16 x i32> @llvm.hexagon.V6.vsubwnq(<512 x i1> %tmp7, <16 x i32> %v3.0258, <16 x i32> %v5.0260) + %tmp12 = tail call <16 x i32> @llvm.hexagon.V6.vaddwnq(<512 x i1> %tmp8, <16 x i32> %tmp11, <16 x i32> %v6.0261) + %tmp13 = tail call <16 x i32> @llvm.hexagon.V6.vsubwnq(<512 x i1> %tmp9, <16 x i32> %tmp12, <16 x i32> %v7.0262) + %tmp14 = tail call <16 x i32> @llvm.hexagon.V6.vaddwnq(<512 x i1> %tmp10, <16 x i32> %tmp13, <16 x i32> undef) + %incdec.ptr66.3 = getelementptr inbounds <16 x i32>, <16 x i32>* %vo.addr.0287, i32 4 + store <16 x i32> %tmp14, <16 x i32>* undef, align 64 + %incdec.ptr67.3 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi.addr.0288, i32 4 + %v31.0.4 = load <16 x i32>, <16 x i32>* %incdec.ptr67.2, align 64 + %tmp15 = tail call <16 x i32> @llvm.hexagon.V6.vaddwnq(<512 x i1> undef, <16 x i32> undef, <16 x i32> undef) + %tmp16 = tail call <16 x i32> @llvm.hexagon.V6.vsubwnq(<512 x i1> undef, <16 x i32> %tmp15, <16 x i32> %v9.0264) + %tmp17 = tail call <16 x i32> @llvm.hexagon.V6.vaddwnq(<512 x i1> undef, <16 x i32> %tmp16, <16 x i32> %v10.0265) + %incdec.ptr66.5 = getelementptr inbounds <16 x i32>, <16 x i32>* %vo.addr.0287, i32 6 + store <16 x i32> %tmp17, <16 x i32>* undef, align 64 + %incdec.ptr67.5 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi.addr.0288, i32 6 + %v31.0.6 = load <16 x i32>, <16 x i32>* undef, align 64 + %incdec.ptr64.6 = getelementptr inbounds i32, i32* %po.addr.0289, i32 7 + %tmp18 = tail call <512 x i1> @llvm.hexagon.V6.vgtw(<16 x i32> undef, <16 x i32> %v31.0.3) + %tmp19 = tail call <512 x i1> @llvm.hexagon.V6.vgtw.xor(<512 x i1> undef, <16 x i32> %v26.0281, <16 x i32> %v27.0282) + %tmp20 = tail call <512 x i1> @llvm.hexagon.V6.vgtw.xor(<512 x i1> %tmp19, <16 x i32> %v7.0262, <16 x i32> %v6.0261) + %tmp21 = tail call <16 x i32> @llvm.hexagon.V6.vsubwnq(<512 x i1> %tmp18, <16 x i32> %v6.0261, <16 x i32> undef) + %tmp22 = tail call <16 x i32> 
@llvm.hexagon.V6.vaddwnq(<512 x i1> undef, <16 x i32> %tmp21, <16 x i32> %v9.0264) + %tmp23 = tail call <16 x i32> @llvm.hexagon.V6.vsubwnq(<512 x i1> %tmp19, <16 x i32> %tmp22, <16 x i32> %v10.0265) + %tmp24 = tail call <16 x i32> @llvm.hexagon.V6.vaddwnq(<512 x i1> %tmp20, <16 x i32> %tmp23, <16 x i32> %v11.0266) + %incdec.ptr66.6 = getelementptr inbounds <16 x i32>, <16 x i32>* %vo.addr.0287, i32 7 + store <16 x i32> %tmp24, <16 x i32>* %incdec.ptr66.5, align 64 + %incdec.ptr67.6 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi.addr.0288, i32 7 + %v31.0.7 = load <16 x i32>, <16 x i32>* %incdec.ptr67.5, align 64 + %incdec.ptr64.7 = getelementptr inbounds i32, i32* %po.addr.0289, i32 8 + store i32 undef, i32* %incdec.ptr64.6, align 4 + %incdec.ptr66.7 = getelementptr inbounds <16 x i32>, <16 x i32>* %vo.addr.0287, i32 8 + store <16 x i32> undef, <16 x i32>* %incdec.ptr66.6, align 64 + %incdec.ptr67.7 = getelementptr inbounds <16 x i32>, <16 x i32>* %vi.addr.0288, i32 8 + %niter.nsub.7 = add i32 %niter, -8 + %niter.ncmp.7 = icmp eq i32 %niter.nsub.7, 0 + br i1 %niter.ncmp.7, label %for.end.loopexit.unr-lcssa, label %for.body + +for.end.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + br i1 undef, label %for.end, label %for.body.epil + +for.body.epil: ; preds = %for.body.epil, %for.end.loopexit.unr-lcssa + br i1 undef, label %for.end, label %for.body.epil + +for.end: ; preds = %for.body.epil, %for.end.loopexit.unr-lcssa, %entry + ret i32 0 +} + +declare <512 x i1> @llvm.hexagon.V6.vgtw(<16 x i32>, <16 x i32>) #1 +declare <512 x i1> @llvm.hexagon.V6.vgtw.xor(<512 x i1>, <16 x i32>, <16 x i32>) #1 +declare <16 x i32> @llvm.hexagon.V6.vsubwnq(<512 x i1>, <16 x i32>, <16 x i32>) #1 +declare <16 x i32> @llvm.hexagon.V6.vaddwnq(<512 x i1>, <16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" } +attributes #1 = { nounwind readnone }
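
Note on the slot-selection change above: the loop no longer takes the first
free slot, but the free slot whose size and alignment are closest to the
requirements of the register class, measured in the street (Manhattan) metric,
i.e. the sum of the size and alignment surpluses. The standalone sketch below
(plain C++, with hypothetical Slot and pickSlot names and none of the LLVM
machinery) illustrates the Hexagon failure mode from the testcase: with a
first-fit scan, an int spill would consume the large vector slot, leaving no
valid slot for a later vector spill; best-fit avoids that.

    // Standalone illustration of best-fit scavenging-slot selection.
    // Slot and pickSlot are made-up names for this sketch only.
    #include <climits>
    #include <cstdio>
    #include <vector>

    struct Slot {
      unsigned Reg;         // 0 means the slot is currently free.
      unsigned Size, Align; // Properties of the backing stack object.
    };

    // Return the index of the free slot that fits (NeedSize, NeedAlign)
    // best in the street metric, or Slots.size() if no slot fits.
    static unsigned pickSlot(const std::vector<Slot> &Slots,
                             unsigned NeedSize, unsigned NeedAlign) {
      unsigned SI = Slots.size(), Diff = UINT_MAX;
      for (unsigned I = 0; I < Slots.size(); ++I) {
        if (Slots[I].Reg != 0)
          continue;
        if (NeedSize > Slots[I].Size || NeedAlign > Slots[I].Align)
          continue; // Slot is too small or under-aligned for this class.
        unsigned D = (Slots[I].Size - NeedSize) + (Slots[I].Align - NeedAlign);
        if (D < Diff) {
          SI = I;
          Diff = D;
        }
      }
      return SI;
    }

    int main() {
      // A vector slot (64/64) reserved before an int slot (4/4), as in
      // the Hexagon testcase. First-fit would give slot 0 to the int
      // spill; best-fit keeps it available for the vector spill.
      std::vector<Slot> Slots = {{0, 64, 64}, {0, 4, 4}};
      printf("int spill    -> slot %u\n", pickSlot(Slots, 4, 4));   // 1
      printf("vector spill -> slot %u\n", pickSlot(Slots, 64, 64)); // 0
      return 0;
    }

Running the sketch prints slot 1 for the int spill (distance 0 beats the
vector slot's distance 120) and slot 0 for the vector spill, matching the
behavior the patch installs in RegScavenger.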