diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -6784,6 +6784,9 @@
 
   // Allocate to a register if possible, or else a stack slot.
   Register Reg;
+  unsigned StoreSizeBytes = XLen / 8;
+  Align StackAlign = Align(XLen / 8);
+
   if (ValVT == MVT::f16 && !UseGPRForF16_F32)
     Reg = State.AllocateReg(ArgFPR16s);
   else if (ValVT == MVT::f32 && !UseGPRForF16_F32)
@@ -6818,15 +6821,25 @@
       // but we're using all of them.
       if (IsRet)
         return true;
-      LocInfo = CCValAssign::Indirect;
       // Try using a GPR to pass the address
-      Reg = State.AllocateReg(ArgGPRs);
-      LocVT = XLenVT;
+      if ((Reg = State.AllocateReg(ArgGPRs))) {
+        LocVT = XLenVT;
+        LocInfo = CCValAssign::Indirect;
+      } else if (ValVT.isScalableVector()) {
+        report_fatal_error("Unable to pass scalable vector types on the stack");
+      } else {
+        // Pass fixed-length vectors on the stack.
+        LocVT = ValVT;
+        StoreSizeBytes = ValVT.getStoreSize();
+        StackAlign = Align(ValVT.getScalarSizeInBits() / 8);
+      }
     }
-  } else
+  } else {
     Reg = State.AllocateReg(ArgGPRs);
+  }
+
   unsigned StackOffset =
-      Reg ? 0 : State.AllocateStack(XLen / 8, Align(XLen / 8));
+      Reg ? 0 : State.AllocateStack(StoreSizeBytes, StackAlign);
 
   // If we reach this point and PendingLocs is non-empty, we must be at the
   // end of a split argument that must be passed indirectly.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
@@ -1168,3 +1168,221 @@
   %r = call <32 x i32> @split_vector_args(<2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <32 x i32> %b, <32 x i32> %b)
   ret <32 x i32> %r
 }
+
+; A rather pathological test case in which we exhaust all vector registers and
+; all scalar registers, forcing %z and %8 to go through the stack.
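+; With a0-a7 taken by %0-%7 and the vector argument registers taken by %x and
+; %y, no GPR remains through which %z could be passed indirectly, so %z is
+; passed by value on the stack, with %8 in the slot immediately after it.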
+define <32 x i32> @vector_arg_via_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8) {
+; LMULMAX8-LABEL: vector_arg_via_stack:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    addi a0, zero, 32
+; LMULMAX8-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
+; LMULMAX8-NEXT:    vle32.v v16, (sp)
+; LMULMAX8-NEXT:    vadd.vv v8, v8, v16
+; LMULMAX8-NEXT:    ret
+;
+; LMULMAX4-LABEL: vector_arg_via_stack:
+; LMULMAX4:       # %bb.0:
+; LMULMAX4-NEXT:    vsetivli a0, 16, e32,m4,ta,mu
+; LMULMAX4-NEXT:    vle32.v v28, (sp)
+; LMULMAX4-NEXT:    addi a0, sp, 64
+; LMULMAX4-NEXT:    vle32.v v16, (a0)
+; LMULMAX4-NEXT:    vadd.vv v8, v8, v28
+; LMULMAX4-NEXT:    vadd.vv v12, v12, v16
+; LMULMAX4-NEXT:    ret
+;
+; LMULMAX2-LABEL: vector_arg_via_stack:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a0, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-NEXT:    addi a0, sp, 32
+; LMULMAX2-NEXT:    vle32.v v28, (a0)
+; LMULMAX2-NEXT:    addi a0, sp, 64
+; LMULMAX2-NEXT:    vle32.v v30, (a0)
+; LMULMAX2-NEXT:    addi a0, sp, 96
+; LMULMAX2-NEXT:    vle32.v v16, (a0)
+; LMULMAX2-NEXT:    vadd.vv v8, v8, v26
+; LMULMAX2-NEXT:    vadd.vv v10, v10, v28
+; LMULMAX2-NEXT:    vadd.vv v12, v12, v30
+; LMULMAX2-NEXT:    vadd.vv v14, v14, v16
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: vector_arg_via_stack:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    addi a0, sp, 112
+; LMULMAX1-NEXT:    vle32.v v25, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 96
+; LMULMAX1-NEXT:    vle32.v v26, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 80
+; LMULMAX1-NEXT:    vle32.v v27, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 64
+; LMULMAX1-NEXT:    vle32.v v28, (a0)
+; LMULMAX1-NEXT:    vle32.v v29, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vle32.v v30, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 32
+; LMULMAX1-NEXT:    vle32.v v31, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 48
+; LMULMAX1-NEXT:    vle32.v v16, (a0)
+; LMULMAX1-NEXT:    vadd.vv v8, v8, v29
+; LMULMAX1-NEXT:    vadd.vv v9, v9, v30
+; LMULMAX1-NEXT:    vadd.vv v10, v10, v31
+; LMULMAX1-NEXT:    vadd.vv v11, v11, v16
+; LMULMAX1-NEXT:    vadd.vv v12, v12, v28
+; LMULMAX1-NEXT:    vadd.vv v13, v13, v27
+; LMULMAX1-NEXT:    vadd.vv v14, v14, v26
+; LMULMAX1-NEXT:    vadd.vv v15, v15, v25
+; LMULMAX1-NEXT:    ret
+  %s = add <32 x i32> %x, %z
+  ret <32 x i32> %s
+}
+
+; Calling the function above. Ensure we pass the arguments correctly.
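+; %z is stored at the bottom of the outgoing argument area and %8 in the slot
+; at sp+128, immediately after the 128 bytes of vector data.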
+define <32 x i32> @pass_vector_arg_via_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
+; LMULMAX8-LABEL: pass_vector_arg_via_stack:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    addi sp, sp, -144
+; LMULMAX8-NEXT:    .cfi_def_cfa_offset 144
+; LMULMAX8-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; LMULMAX8-NEXT:    .cfi_offset ra, -8
+; LMULMAX8-NEXT:    addi a0, zero, 32
+; LMULMAX8-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
+; LMULMAX8-NEXT:    vmv.v.i v8, 0
+; LMULMAX8-NEXT:    vse32.v v8, (sp)
+; LMULMAX8-NEXT:    addi a0, zero, 8
+; LMULMAX8-NEXT:    addi a1, zero, 1
+; LMULMAX8-NEXT:    addi a2, zero, 2
+; LMULMAX8-NEXT:    addi a3, zero, 3
+; LMULMAX8-NEXT:    addi a4, zero, 4
+; LMULMAX8-NEXT:    addi a5, zero, 5
+; LMULMAX8-NEXT:    addi a6, zero, 6
+; LMULMAX8-NEXT:    addi a7, zero, 7
+; LMULMAX8-NEXT:    sd a0, 128(sp)
+; LMULMAX8-NEXT:    mv a0, zero
+; LMULMAX8-NEXT:    vmv8r.v v16, v8
+; LMULMAX8-NEXT:    call vector_arg_via_stack@plt
+; LMULMAX8-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; LMULMAX8-NEXT:    addi sp, sp, 144
+; LMULMAX8-NEXT:    ret
+;
+; LMULMAX4-LABEL: pass_vector_arg_via_stack:
+; LMULMAX4:       # %bb.0:
+; LMULMAX4-NEXT:    addi sp, sp, -144
+; LMULMAX4-NEXT:    .cfi_def_cfa_offset 144
+; LMULMAX4-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; LMULMAX4-NEXT:    .cfi_offset ra, -8
+; LMULMAX4-NEXT:    addi a0, zero, 8
+; LMULMAX4-NEXT:    sd a0, 128(sp)
+; LMULMAX4-NEXT:    vsetivli a0, 16, e32,m4,ta,mu
+; LMULMAX4-NEXT:    vmv.v.i v8, 0
+; LMULMAX4-NEXT:    vse32.v v8, (sp)
+; LMULMAX4-NEXT:    addi a0, sp, 64
+; LMULMAX4-NEXT:    addi a1, zero, 1
+; LMULMAX4-NEXT:    addi a2, zero, 2
+; LMULMAX4-NEXT:    addi a3, zero, 3
+; LMULMAX4-NEXT:    addi a4, zero, 4
+; LMULMAX4-NEXT:    addi a5, zero, 5
+; LMULMAX4-NEXT:    addi a6, zero, 6
+; LMULMAX4-NEXT:    addi a7, zero, 7
+; LMULMAX4-NEXT:    vse32.v v8, (a0)
+; LMULMAX4-NEXT:    mv a0, zero
+; LMULMAX4-NEXT:    vmv4r.v v12, v8
+; LMULMAX4-NEXT:    vmv4r.v v16, v8
+; LMULMAX4-NEXT:    vmv4r.v v20, v8
+; LMULMAX4-NEXT:    call vector_arg_via_stack@plt
+; LMULMAX4-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; LMULMAX4-NEXT:    addi sp, sp, 144
+; LMULMAX4-NEXT:    ret
+;
+; LMULMAX2-LABEL: pass_vector_arg_via_stack:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    addi sp, sp, -144
+; LMULMAX2-NEXT:    .cfi_def_cfa_offset 144
+; LMULMAX2-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; LMULMAX2-NEXT:    .cfi_offset ra, -8
+; LMULMAX2-NEXT:    addi a0, zero, 8
+; LMULMAX2-NEXT:    sd a0, 128(sp)
+; LMULMAX2-NEXT:    vsetivli a0, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vmv.v.i v8, 0
+; LMULMAX2-NEXT:    vse32.v v8, (sp)
+; LMULMAX2-NEXT:    addi a0, sp, 96
+; LMULMAX2-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-NEXT:    addi a0, sp, 64
+; LMULMAX2-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-NEXT:    addi a0, sp, 32
+; LMULMAX2-NEXT:    addi a1, zero, 1
+; LMULMAX2-NEXT:    addi a2, zero, 2
+; LMULMAX2-NEXT:    addi a3, zero, 3
+; LMULMAX2-NEXT:    addi a4, zero, 4
+; LMULMAX2-NEXT:    addi a5, zero, 5
+; LMULMAX2-NEXT:    addi a6, zero, 6
+; LMULMAX2-NEXT:    addi a7, zero, 7
+; LMULMAX2-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-NEXT:    mv a0, zero
+; LMULMAX2-NEXT:    vmv2r.v v10, v8
+; LMULMAX2-NEXT:    vmv2r.v v12, v8
+; LMULMAX2-NEXT:    vmv2r.v v14, v8
+; LMULMAX2-NEXT:    vmv2r.v v16, v8
+; LMULMAX2-NEXT:    vmv2r.v v18, v8
+; LMULMAX2-NEXT:    vmv2r.v v20, v8
+; LMULMAX2-NEXT:    vmv2r.v v22, v8
+; LMULMAX2-NEXT:    call vector_arg_via_stack@plt
+; LMULMAX2-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; LMULMAX2-NEXT:    addi sp, sp, 144
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: pass_vector_arg_via_stack:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi sp, sp, -144
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 144
+; LMULMAX1-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; LMULMAX1-NEXT:    .cfi_offset ra, -8
+; LMULMAX1-NEXT:    addi a0, zero, 8
+; LMULMAX1-NEXT:    sd a0, 128(sp)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vmv.v.i v8, 0
+; LMULMAX1-NEXT:    vse32.v v8, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 112
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 96
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 80
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 64
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 48
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 32
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    addi a1, zero, 1
+; LMULMAX1-NEXT:    addi a2, zero, 2
+; LMULMAX1-NEXT:    addi a3, zero, 3
+; LMULMAX1-NEXT:    addi a4, zero, 4
+; LMULMAX1-NEXT:    addi a5, zero, 5
+; LMULMAX1-NEXT:    addi a6, zero, 6
+; LMULMAX1-NEXT:    addi a7, zero, 7
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
+; LMULMAX1-NEXT:    mv a0, zero
+; LMULMAX1-NEXT:    vmv1r.v v9, v8
+; LMULMAX1-NEXT:    vmv1r.v v10, v8
+; LMULMAX1-NEXT:    vmv1r.v v11, v8
+; LMULMAX1-NEXT:    vmv1r.v v12, v8
+; LMULMAX1-NEXT:    vmv1r.v v13, v8
+; LMULMAX1-NEXT:    vmv1r.v v14, v8
+; LMULMAX1-NEXT:    vmv1r.v v15, v8
+; LMULMAX1-NEXT:    vmv1r.v v16, v8
+; LMULMAX1-NEXT:    vmv1r.v v17, v8
+; LMULMAX1-NEXT:    vmv1r.v v18, v8
+; LMULMAX1-NEXT:    vmv1r.v v19, v8
+; LMULMAX1-NEXT:    vmv1r.v v20, v8
+; LMULMAX1-NEXT:    vmv1r.v v21, v8
+; LMULMAX1-NEXT:    vmv1r.v v22, v8
+; LMULMAX1-NEXT:    vmv1r.v v23, v8
+; LMULMAX1-NEXT:    call vector_arg_via_stack@plt
+; LMULMAX1-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; LMULMAX1-NEXT:    addi sp, sp, 144
+; LMULMAX1-NEXT:    ret
+  %s = call <32 x i32> @vector_arg_via_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 8)
+  ret <32 x i32> %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/unsupported-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/unsupported-calling-conv.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/unsupported-calling-conv.ll
@@ -0,0 +1,11 @@
+; RUN: not --crash llc -mtriple=riscv64 -mattr=+experimental-v < %s 2>&1 | FileCheck %s
+
+; A rather pathological test case in which we exhaust all vector registers and
+; all scalar registers, forcing %z to go through the stack. This is not yet
+; supported, so check that a reasonable error message is produced rather than
+; hitting an assertion or producing incorrect code.
+; CHECK: LLVM ERROR: Unable to pass scalable vector types on the stack
+define <vscale x 16 x i32> @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <vscale x 16 x i32> %x, <vscale x 16 x i32> %y, <vscale x 16 x i32> %z) {
+  %s = add <vscale x 16 x i32> %x, %z
+  ret <vscale x 16 x i32> %s
+}