Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1163,7 +1163,7 @@
   // register scavenging. If we already spilled an extra callee-saved register
   // above to keep the number of spills even, we don't need to do anything else
   // here.
-  if (BigStack && !ExtraCSSpill) {
+  if ((BigStack || AFI->hasNonRISpills()) && !ExtraCSSpill) {
     if (UnspilledCSGPR != AArch64::NoRegister) {
       DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo)
                    << " to get a scratch register.\n");
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
@@ -2424,6 +2425,8 @@

   if (Offset)
     MI.addImm(0);
+  else
+    MF.getInfo<AArch64FunctionInfo>()->setHasNonRISpills();
   MI.addMemOperand(MMO);
 }
@@ -2527,6 +2530,8 @@
       .addFrameIndex(FI);
   if (Offset)
     MI.addImm(0);
+  else
+    MF.getInfo<AArch64FunctionInfo>()->setHasNonRISpills();
   MI.addMemOperand(MMO);
 }
Index: lib/Target/AArch64/AArch64MachineFunctionInfo.h
===================================================================
--- lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -88,6 +88,10 @@
   /// other stack allocations.
   bool CalleeSaveStackHasFreeSpace = false;

+  /// True if the function has spills using instructions with no immediate
+  /// offset field.
+  bool HasNonRISpills = false;
+
 public:
   AArch64FunctionInfo() = default;

@@ -179,6 +183,9 @@
     LOHRelated.insert(Args.begin(), Args.end());
   }

+  void setHasNonRISpills() { HasNonRISpills = true; }
+  bool hasNonRISpills() const { return HasNonRISpills; }
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
Index: test/CodeGen/AArch64/emergency_spill_slot_nonri.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/emergency_spill_slot_nonri.ll
@@ -0,0 +1,202 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -misched-cutoff=0 < %s

+; Make sure that the emergency spill slot is created when we spill using an
+; instruction that doesn't have a base+imm form. If we cannot scavenge a spare
+; register when fixing up the frame index, we need the emergency spill slot
+; to spill a register to.
+; Test case taken from arm64-spill-lr.ll, modified slightly to require a spill
+; using st1 while the estimated stack size does not exceed 256 bytes.
+ +@bar = common global i32 0, align 4 +@bar2 = common global i32 0, align 4 + +define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) nounwind { +entry: + %stack = alloca [1 x i32], align 4 + %0 = bitcast [1 x i32]* %stack to i8* + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds [1 x i32], [1 x i32]* %stack, i64 0, i64 %idxprom + store i32 %b, i32* %arrayidx, align 4 + %1 = load volatile i32, i32* @bar, align 4 + %2 = load volatile i32, i32* @bar, align 4 + %3 = load volatile i32, i32* @bar, align 4 + %4 = load volatile i32, i32* @bar, align 4 + %5 = load volatile i32, i32* @bar, align 4 + %6 = load volatile i32, i32* @bar, align 4 + %7 = load volatile i32, i32* @bar, align 4 + %8 = load volatile i32, i32* @bar, align 4 + %9 = load volatile i32, i32* @bar, align 4 + %10 = load volatile i32, i32* @bar, align 4 + %idx0 = bitcast i32* @bar to <4 x i32>* + %vld0 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx0) + %idx1temp = getelementptr inbounds i32, i32* @bar, i64 1 + %idx1 = bitcast i32* %idx1temp to <4 x i32>* + %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx1) + %idx2temp = getelementptr inbounds i32, i32* @bar, i64 2 + %idx2 = bitcast i32* %idx2temp to <4 x i32>* + %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx2) + %idx3temp = getelementptr inbounds i32, i32* @bar, i64 3 + %idx3 = bitcast i32* %idx3temp to <4 x i32>* + %vld3 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx3) + %idx4temp = getelementptr inbounds i32, i32* @bar, i64 4 + %idx4 = bitcast i32* %idx4temp to <4 x i32>* + %vld4 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx4) + %idx5temp = getelementptr inbounds i32, i32* @bar, i64 5 + %idx5 = bitcast i32* %idx5temp to <4 x i32>* + %vld5 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx5) + %idx6temp = getelementptr inbounds i32, i32* @bar, i64 6 + %idx6 = bitcast i32* %idx6temp to <4 x i32>* + %vld6 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx6) + %idx7temp = getelementptr inbounds i32, i32* @bar, i64 7 + %idx7 = bitcast i32* %idx7temp to <4 x i32>* + %vld7 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx7) + %idx8temp = getelementptr inbounds i32, i32* @bar, i64 8 + %idx8 = bitcast i32* %idx8temp to <4 x i32>* + %vld8 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx8) + %idx9temp = getelementptr inbounds i32, i32* @bar, i64 9 + %idx9 = bitcast i32* %idx9temp to <4 x i32>* + %vld9 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx9) + %idx10temp = getelementptr inbounds i32, i32* @bar, i64 10 + %idx10 = bitcast i32* %idx10temp to <4 x i32>* + %vld10 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx10) + %idx11temp = getelementptr inbounds i32, i32* @bar, i64 11 + %idx11 = bitcast i32* %idx11temp to <4 x i32>* + %vld11 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx11) + %idx12temp = getelementptr inbounds i32, i32* @bar, i64 12 + %idx12 = bitcast i32* %idx12temp to <4 x i32>* + %vld12 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx12) + %idx13temp = getelementptr inbounds i32, i32* @bar, 
i64 13 + %idx13 = bitcast i32* %idx13temp to <4 x i32>* + %vld13 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx13) + %idx14temp = getelementptr inbounds i32, i32* @bar, i64 14 + %idx14 = bitcast i32* %idx14temp to <4 x i32>* + %vld14 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx14) + %idx15temp = getelementptr inbounds i32, i32* @bar, i64 15 + %idx15 = bitcast i32* %idx15temp to <4 x i32>* + %vld15 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx15) + %idx16temp = getelementptr inbounds i32, i32* @bar, i64 16 + %idx16 = bitcast i32* %idx16temp to <4 x i32>* + %vld16 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %idx16) + %vld0.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld0, 0 + %vld0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld0, 1 + %sidx0 = bitcast i32* @bar2 to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld0.fca.0.extract, <4 x i32> %vld0.extract, i64 1, i8* %sidx0) + %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0 + %vld1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 1 + %sidx1temp = getelementptr inbounds i32, i32* @bar2, i64 1 + %sidx1 = bitcast i32* %sidx1temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld1.fca.0.extract, <4 x i32> %vld1.extract, i64 1, i8* %sidx1) + %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0 + %vld2.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1 + %sidx2temp = getelementptr inbounds i32, i32* @bar2, i64 2 + %sidx2 = bitcast i32* %sidx2temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.extract, i64 1, i8* %sidx2) + %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld3, 0 + %vld3.extract = extractvalue { <4 x i32>, <4 x i32> } %vld3, 1 + %sidx3temp = getelementptr inbounds i32, i32* @bar2, i64 3 + %sidx3 = bitcast i32* %sidx3temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.extract, i64 1, i8* %sidx3) + %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld4, 0 + %vld4.extract = extractvalue { <4 x i32>, <4 x i32> } %vld4, 1 + %sidx4temp = getelementptr inbounds i32, i32* @bar2, i64 4 + %sidx4 = bitcast i32* %sidx4temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld4.fca.0.extract, <4 x i32> %vld4.extract, i64 1, i8* %sidx4) + %vld5.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld5, 0 + %vld5.extract = extractvalue { <4 x i32>, <4 x i32> } %vld5, 1 + %sidx5temp = getelementptr inbounds i32, i32* @bar2, i64 5 + %sidx5 = bitcast i32* %sidx5temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld5.fca.0.extract, <4 x i32> %vld5.extract, i64 1, i8* %sidx5) + %vld6.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld6, 0 + %vld6.extract = extractvalue { <4 x i32>, <4 x i32> } %vld6, 1 + %sidx6temp = getelementptr inbounds i32, i32* @bar2, i64 6 + %sidx6 = bitcast i32* %sidx6temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld6.fca.0.extract, <4 x i32> %vld6.extract, i64 1, i8* %sidx6) + %vld7.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld7, 0 + %vld7.extract = extractvalue { <4 x i32>, <4 x i32> } %vld7, 1 + %sidx7temp = getelementptr inbounds i32, i32* @bar2, i64 7 + %sidx7 = bitcast i32* %sidx7temp to i8* + 
tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld7.fca.0.extract, <4 x i32> %vld7.extract, i64 1, i8* %sidx7) + %vld8.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld8, 0 + %vld8.extract = extractvalue { <4 x i32>, <4 x i32> } %vld8, 1 + %sidx8temp = getelementptr inbounds i32, i32* @bar2, i64 8 + %sidx8 = bitcast i32* %sidx8temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld8.fca.0.extract, <4 x i32> %vld8.extract, i64 1, i8* %sidx8) + %vld9.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld9, 0 + %vld9.extract = extractvalue { <4 x i32>, <4 x i32> } %vld9, 1 + %sidx9temp = getelementptr inbounds i32, i32* @bar2, i64 9 + %sidx9 = bitcast i32* %sidx9temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld9.fca.0.extract, <4 x i32> %vld9.extract, i64 1, i8* %sidx9) + %vld10.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld10, 0 + %vld10.extract = extractvalue { <4 x i32>, <4 x i32> } %vld10, 1 + %sidx10temp = getelementptr inbounds i32, i32* @bar2, i64 10 + %sidx10 = bitcast i32* %sidx10temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld10.fca.0.extract, <4 x i32> %vld10.extract, i64 1, i8* %sidx10) + %vld11.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld11, 0 + %vld11.extract = extractvalue { <4 x i32>, <4 x i32> } %vld11, 1 + %sidx11temp = getelementptr inbounds i32, i32* @bar2, i64 11 + %sidx11 = bitcast i32* %sidx11temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld11.fca.0.extract, <4 x i32> %vld11.extract, i64 1, i8* %sidx11) + %vld12.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld12, 0 + %vld12.extract = extractvalue { <4 x i32>, <4 x i32> } %vld12, 1 + %sidx12temp = getelementptr inbounds i32, i32* @bar2, i64 12 + %sidx12 = bitcast i32* %sidx12temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld12.fca.0.extract, <4 x i32> %vld12.extract, i64 1, i8* %sidx12) + %vld13.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld13, 0 + %vld13.extract = extractvalue { <4 x i32>, <4 x i32> } %vld13, 1 + %sidx13temp = getelementptr inbounds i32, i32* @bar2, i64 13 + %sidx13 = bitcast i32* %sidx13temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld13.fca.0.extract, <4 x i32> %vld13.extract, i64 1, i8* %sidx13) + %vld14.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld14, 0 + %vld14.extract = extractvalue { <4 x i32>, <4 x i32> } %vld14, 1 + %sidx14temp = getelementptr inbounds i32, i32* @bar2, i64 14 + %sidx14 = bitcast i32* %sidx14temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld14.fca.0.extract, <4 x i32> %vld14.extract, i64 1, i8* %sidx14) + %vld15.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld15, 0 + %vld15.extract = extractvalue { <4 x i32>, <4 x i32> } %vld15, 1 + %sidx15temp = getelementptr inbounds i32, i32* @bar2, i64 15 + %sidx15 = bitcast i32* %sidx15temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld15.fca.0.extract, <4 x i32> %vld15.extract, i64 1, i8* %sidx15) + %vld16.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld16, 0 + %vld16.extract = extractvalue { <4 x i32>, <4 x i32> } %vld16, 1 + %sidx16temp = getelementptr inbounds i32, i32* @bar2, i64 16 + %sidx16 = bitcast i32* %sidx16temp to i8* + tail call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> %vld16.fca.0.extract, <4 x i32> %vld16.extract, i64 1, i8* %sidx16) + %idxprom1 = sext i32 %c to 
i64 + %arrayidx2 = getelementptr inbounds [1 x i32], [1 x i32]* %stack, i64 0, i64 %idxprom1 + %11 = load i32, i32* %arrayidx2, align 4 + %factor = mul i32 %h, -2 + %factor67 = mul i32 %g, -2 + %factor68 = mul i32 %f, -2 + %factor69 = mul i32 %e, -2 + %factor70 = mul i32 %d, -2 + %factor71 = mul i32 %c, -2 + %factor72 = mul i32 %b, -2 + %sum = add i32 %2, %1 + %sum73 = add i32 %sum, %3 + %sum74 = add i32 %sum73, %4 + %sum75 = add i32 %sum74, %5 + %sum76 = add i32 %sum75, %6 + %sum77 = add i32 %sum76, %7 + %sum78 = add i32 %sum77, %8 + %sum79 = add i32 %sum78, %9 + %sum80 = add i32 %sum79, %10 + %sub15 = sub i32 %11, %sum80 + %sub16 = add i32 %sub15, %factor + %sub17 = add i32 %sub16, %factor67 + %sub18 = add i32 %sub17, %factor68 + %sub19 = add i32 %sub18, %factor69 + %sub20 = add i32 %sub19, %factor70 + %sub21 = add i32 %sub20, %factor71 + %add = add i32 %sub21, %factor72 + ret i32 %add +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>*) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32>, <4 x i32>, i64, i8* nocapture) #2 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind readonly } +attributes #2 = { argmemonly nounwind }
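
Note (not part of the patch): below is a minimal standalone C++ sketch of the decision the AArch64FrameLowering.cpp hunk changes. It is not LLVM code and uses none of the LLVM API; the names BigStack and HasNonRISpills and the roughly 256-byte scavenging limit are taken from the patch and the test comment, and the numbers in main() are purely illustrative.

// Standalone sketch of the scratch-register / emergency-spill-slot decision.
// Assumption: this mirrors only the boolean condition changed by the patch,
// not the surrounding determineCalleeSaves() logic.
#include <cstdio>

// Returns true when frame-index elimination may need a register to
// materialize a stack offset, i.e. when we must either spill an extra
// callee-saved GPR or reserve an emergency spill slot for the scavenger.
static bool needsScratchOrEmergencySlot(unsigned EstimatedStackSize,
                                        unsigned ScavengeLimit, // ~256 bytes
                                        bool HasNonRISpills) {
  bool BigStack = EstimatedStackSize > ScavengeLimit;
  // Before the patch only BigStack triggered this path; afterwards a spill
  // through an instruction with no immediate offset field (e.g. st1) does
  // too, because such a spill always needs its frame offset in a register.
  return BigStack || HasNonRISpills;
}

int main() {
  // The scenario exercised by the new test: a frame under the limit combined
  // with an st1 spill still requires the emergency spill slot.
  std::printf("small frame, st1 spill    -> %d\n",
              needsScratchOrEmergencySlot(192, 256, /*HasNonRISpills=*/true));
  std::printf("small frame, ldr/str only -> %d\n",
              needsScratchOrEmergencySlot(192, 256, /*HasNonRISpills=*/false));
  return 0;
}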