diff --git a/llvm/lib/Target/VE/VECallingConv.td b/llvm/lib/Target/VE/VECallingConv.td
--- a/llvm/lib/Target/VE/VECallingConv.td
+++ b/llvm/lib/Target/VE/VECallingConv.td
@@ -49,6 +49,19 @@
   CCDelegateTo<CC_VE_C_Stack>
 ]>;
 
+// All arguments are passed on the stack for varargs functions and
+// non-prototyped functions.
+def CC_VE2 : CallingConv<[
+  // float --> needs special handling, as shown below.
+  //    0      4
+  //    +------+------+
+  //    | empty| float|
+  //    +------+------+
+  CCIfType<[f32], CCCustom<"allocateFloat">>,
+
+  CCAssignToStack<0, 8>
+]>;
+
 def RetCC_VE : CallingConv<[
   // Promote i1/i8/i16 arguments to i32.
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -71,6 +71,8 @@
   /// Custom Lower {
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
   /// } Custom Lower
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -13,6 +13,7 @@
 #include "VEISelLowering.h"
 #include "MCTargetDesc/VEMCExpr.h"
+#include "VEMachineFunctionInfo.h"
 #include "VERegisterInfo.h"
 #include "VETargetMachine.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -203,7 +204,20 @@
                                  MachinePointerInfo::getFixedStack(MF, FI)));
   }
 
-  assert(!IsVarArg && "TODO implement var args");
+  if (!IsVarArg)
+    return Chain;
+
+  // This function takes variable arguments, some of which may have been passed
+  // in registers %s0-%s7.
+  //
+  // The va_start intrinsic needs to know the offset to the first variable
+  // argument.
+  // TODO: need to calculate offset correctly once we support f128.
+  unsigned ArgOffset = ArgLocs.size() * 8;
+  VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+  // Skip the 176 bytes of register save area.
+  FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
+
   return Chain;
 }
@@ -258,7 +272,16 @@
   // by CC_VE would be correct now.
   CCInfo.AnalyzeCallOperands(CLI.Outs, CC_VE);
 
-  assert(!CLI.IsVarArg);
+  // VE requires using both registers and the stack for varargs and
+  // non-prototyped functions.
+  bool UseBoth = CLI.IsVarArg;
+
+  // Analyze the operands again with CC_VE2 if they must be stored BOTH.
+  SmallVector<CCValAssign, 16> ArgLocs2;
+  CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
+                  ArgLocs2, *DAG.getContext());
+  if (UseBoth)
+    CCInfo2.AnalyzeCallOperands(CLI.Outs, CC_VE2);
 
   // Get the size of the outgoing arguments stack space requirement.
   unsigned ArgsSize = CCInfo.getNextStackOffset();
@@ -326,7 +349,9 @@
     if (VA.isRegLoc()) {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-      continue;
+      if (!UseBoth)
+        continue;
+      VA = ArgLocs2[i];
     }
 
     assert(VA.isMemLoc());
@@ -521,6 +546,15 @@
   setOperationAction(ISD::BlockAddress, PtrVT, Custom);
   setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
 
+  /// VAARG handling {
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  // VAARG needs to be lowered to access memory with 8-byte alignment.
+  setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  // Use the default implementation.
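+  // (Expand lowers VACOPY to a load and store of the single va_list
+  // pointer, and turns VAEND into a no-op.)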
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  /// } VAARG handling
+
   // VE has no REM or DIVREM operations.
   for (MVT IntVT : MVT::integer_valuetypes()) {
     setOperationAction(ISD::UREM, IntVT, Expand);
@@ -623,6 +657,66 @@
   return makeAddress(Op, DAG);
 }
 
+SDValue VETargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Need frame address to find the address of VarArgsFrameIndex.
+  MF.getFrameInfo().setFrameAddressIsTaken(true);
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  SDLoc DL(Op);
+  SDValue Offset =
+      DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
+                  DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
+                      MachinePointerInfo(SV));
+}
+
+SDValue VETargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  EVT PtrVT = VAListPtr.getValueType();
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc DL(Node);
+  SDValue VAList =
+      DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
+  SDValue Chain = VAList.getValue(1);
+  SDValue NextPtr;
+
+  if (VT == MVT::f32) {
+    // float --> needs special handling, as shown below.
+    //    0      4
+    //    +------+------+
+    //    | empty| float|
+    //    +------+------+
+    // Increment the pointer, VAList, by 8 to the next vaarg.
+    NextPtr =
+        DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
+    // Then, adjust VAList to point at the float, 4 bytes into the slot.
+    unsigned InternalOffset = 4;
+    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+                         DAG.getConstant(InternalOffset, DL, PtrVT));
+  } else {
+    // Increment the pointer, VAList, by 8 to the next vaarg.
+    NextPtr =
+        DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
+  }
+
+  // Store the incremented VAList to the legalized pointer.
+  InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));
+
+  // Load the actual argument out of the pointer VAList.
+  // We can't count on greater alignment than the word size.
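+  // E.g., an f32 value loads with min(64, 32) / 8 = 4-byte alignment, while
+  // i64/f64 values load with the full 8-byte slot alignment.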
+  return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
+                     std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
+}
+
 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default:
@@ -631,6 +725,10 @@
     return LowerBlockAddress(Op, DAG);
   case ISD::GlobalAddress:
     return LowerGlobalAddress(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG);
   }
 }
 /// } Custom Lower
diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h
--- a/llvm/lib/Target/VE/VEInstrInfo.h
+++ b/llvm/lib/Target/VE/VEInstrInfo.h
@@ -60,6 +60,23 @@
                    const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
                    bool KillSrc) const override;
 
+  /// Stack Spill & Reload {
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+                           bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI, unsigned DestReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+  /// } Stack Spill & Reload
+
   // Lower pseudo instructions after register allocation.
   bool expandPostRAPseudo(MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -298,6 +298,118 @@
   }
 }
 
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex) const {
+  if (MI.getOpcode() == VE::LDSri || // I64
+      MI.getOpcode() == VE::LDLri || // I32
+      MI.getOpcode() == VE::LDUri    // F32
+  ) {
+    if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+        MI.getOperand(2).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stored stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned VEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                         int &FrameIndex) const {
+  if (MI.getOpcode() == VE::STSri || // I64
+      MI.getOpcode() == VE::STLri || // I32
+      MI.getOpcode() == VE::STUri    // F32
+  ) {
+    if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+        MI.getOperand(1).getImm() == 0) {
+      FrameIndex = MI.getOperand(0).getIndex();
+      return MI.getOperand(2).getReg();
+    }
+  }
+  return 0;
+}
+
+void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned SrcReg, bool isKill, int FI,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
+  if (RC == &VE::I64RegClass) {
+    BuildMI(MBB, I, DL, get(VE::STSri))
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addReg(SrcReg, getKillRegState(isKill))
+        .addMemOperand(MMO);
+  } else if (RC == &VE::I32RegClass) {
+    BuildMI(MBB, I, DL, get(VE::STLri))
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addReg(SrcReg, getKillRegState(isKill))
+        .addMemOperand(MMO);
+  } else if (RC == &VE::F32RegClass) {
+    BuildMI(MBB, I, DL, get(VE::STUri))
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addReg(SrcReg, getKillRegState(isKill))
+        .addMemOperand(MMO);
+  } else
+    report_fatal_error("Can't store this register to stack slot");
+}
+
+void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DestReg, int FI,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  if (RC == &VE::I64RegClass) {
+    BuildMI(MBB, I, DL, get(VE::LDSri), DestReg)
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addMemOperand(MMO);
+  } else if (RC == &VE::I32RegClass) {
+    BuildMI(MBB, I, DL, get(VE::LDLri), DestReg)
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addMemOperand(MMO);
+  } else if (RC == &VE::F32RegClass) {
+    BuildMI(MBB, I, DL, get(VE::LDUri), DestReg)
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addMemOperand(MMO);
+  } else
+    report_fatal_error("Can't load this register from stack slot");
+}
+
 bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case VE::EXTEND_STACK: {
diff --git a/llvm/lib/Target/VE/VEMachineFunctionInfo.h b/llvm/lib/Target/VE/VEMachineFunctionInfo.h
--- a/llvm/lib/Target/VE/VEMachineFunctionInfo.h
+++ b/llvm/lib/Target/VE/VEMachineFunctionInfo.h
@@ -20,12 +20,19 @@
   virtual void anchor();
 
 private:
+  /// VarArgsFrameOffset - Frame offset to start of varargs area.
+  int VarArgsFrameOffset;
+
   /// IsLeafProc - True if the function is a leaf procedure.
   bool IsLeafProc;
 
 public:
-  VEMachineFunctionInfo() : IsLeafProc(false) {}
-  explicit VEMachineFunctionInfo(MachineFunction &MF) : IsLeafProc(false) {}
+  VEMachineFunctionInfo() : VarArgsFrameOffset(0), IsLeafProc(false) {}
+  explicit VEMachineFunctionInfo(MachineFunction &MF)
+      : VarArgsFrameOffset(0), IsLeafProc(false) {}
+
+  int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+  void setVarArgsFrameOffset(int Offset) { VarArgsFrameOffset = Offset; }
 
   void setLeafProc(bool rhs) { IsLeafProc = rhs; }
   bool isLeafProc() const { return IsLeafProc; }
diff --git a/llvm/test/CodeGen/VE/va_arg.ll b/llvm/test/CodeGen/VE/va_arg.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/va_arg.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
+
+@.str = private unnamed_addr constant [6 x i8] c"a=%d\0A\00", align 1
+@.str.1 = private unnamed_addr constant [6 x i8] c"b=%d\0A\00", align 1
+@.str.2 = private unnamed_addr constant [6 x i8] c"c=%d\0A\00", align 1
+@.str.3 = private unnamed_addr constant [6 x i8] c"d=%u\0A\00", align 1
+@.str.4 = private unnamed_addr constant [6 x i8] c"e=%u\0A\00", align 1
+@.str.5 = private unnamed_addr constant [6 x i8] c"f=%u\0A\00", align 1
+@.str.6 = private unnamed_addr constant [6 x i8] c"g=%f\0A\00", align 1
+@.str.7 = private unnamed_addr constant [6 x i8] c"h=%p\0A\00", align 1
+@.str.8 = private unnamed_addr constant [7 x i8] c"i=%ld\0A\00", align 1
+@.str.9 = private unnamed_addr constant [7 x i8] c"j=%lf\0A\00", align 1
+
+define i32 @func_vainout(i32, ...) {
+; CHECK-LABEL: func_vainout:
+; CHECK: ldl.sx %s1, 184(,%s9)
+; CHECK: ld2b.sx %s18, 192(,%s9)
+; CHECK: ld1b.sx %s19, 200(,%s9)
+; CHECK: ldl.sx %s20, 208(,%s9)
+; CHECK: ld2b.zx %s21, 216(,%s9)
+; CHECK: ld1b.zx %s22, 224(,%s9)
+; CHECK: ldu %s26, 236(,%s9)
+; CHECK: ld %s23, 240(,%s9)
+; CHECK: ld %s24, 248(,%s9)
+; CHECK: ld %s25, 256(,%s9)
+
+  %a = alloca i8*, align 8
+  %a8 = bitcast i8** %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %a8)
+  call void @llvm.va_start(i8* nonnull %a8)
+  %p0 = va_arg i8** %a, i32
+  %p1 = va_arg i8** %a, i16
+  %p2 = va_arg i8** %a, i8
+  %p3 = va_arg i8** %a, i32
+  %p4 = va_arg i8** %a, i16
+  %p5 = va_arg i8** %a, i8
+  %p6 = va_arg i8** %a, float
+  %p7 = va_arg i8** %a, i8*
+  %p8 = va_arg i8** %a, i64
+  %p9 = va_arg i8** %a, double
+  call void @llvm.va_end(i8* nonnull %a8)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %a8)
+  %pf0 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0), i32 %p0)
+  %p1.s32 = sext i16 %p1 to i32
+  %pf1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.1, i64 0, i64 0), i32 %p1.s32)
+  %p2.s32 = sext i8 %p2 to i32
+  %pf2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.2, i64 0, i64 0), i32 %p2.s32)
+  %pf3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.3, i64 0, i64 0), i32 %p3)
+  %p4.z32 = zext i16 %p4 to i32
+  %pf4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.4, i64 0, i64 0), i32 %p4.z32)
+  %p5.z32 = zext i8 %p5 to i32
+  %pf5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.5, i64 0, i64 0), i32 %p5.z32)
+  %p6.d = fpext float %p6 to double
+  %pf6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.6, i64 0, i64 0), double %p6.d)
+  %pf7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.7, i64 0, i64 0), i8* %p7)
+  %pf8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.8, i64 0, i64 0), i64 %p8)
+  %pf9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.9, i64 0, i64 0), double %p9)
+  ret i32 0
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)
+declare i32 @printf(i8* nocapture readonly, ...)
diff --git a/llvm/test/CodeGen/VE/va_callee.ll b/llvm/test/CodeGen/VE/va_callee.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/va_callee.ll
@@ -0,0 +1,152 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
+
+define i32 @va_func(i32, ...) {
+; CHECK-LABEL: va_func:
+; CHECK: ldl.sx %s0, 184(,%s9)
+; CHECK: ld2b.sx %s18, 192(,%s9)
+; CHECK: ld1b.sx %s19, 200(,%s9)
+; CHECK: ldl.sx %s20, 208(,%s9)
+; CHECK: ld2b.zx %s21, 216(,%s9)
+; CHECK: ld1b.zx %s22, 224(,%s9)
+; CHECK: ldu %s23, 236(,%s9)
+; CHECK: ld %s24, 240(,%s9)
+; CHECK: ld %s25, 248(,%s9)
+
+  %va = alloca i8*, align 8
+  %va.i8 = bitcast i8** %va to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %va.i8)
+  call void @llvm.va_start(i8* nonnull %va.i8)
+  %p1 = va_arg i8** %va, i32
+  %p2 = va_arg i8** %va, i16
+  %p3 = va_arg i8** %va, i8
+  %p4 = va_arg i8** %va, i32
+  %p5 = va_arg i8** %va, i16
+  %p6 = va_arg i8** %va, i8
+  %p7 = va_arg i8** %va, float
+  %p8 = va_arg i8** %va, i8*
+  %p9 = va_arg i8** %va, i64
+  %p10 = va_arg i8** %va, double
+  call void @llvm.va_end(i8* nonnull %va.i8)
+  call void @use_i32(i32 %p1)
+  call void @use_s16(i16 %p2)
+  call void @use_s8(i8 %p3)
+  call void @use_i32(i32 %p4)
+  call void @use_u16(i16 %p5)
+  call void @use_u8(i8 %p6)
+  call void @use_float(float %p7)
+  call void @use_i8p(i8* %p8)
+  call void @use_i64(i64 %p9)
+  call void @use_double(double %p10)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %va.i8)
+  ret i32 0
+}
+
+define i32 @va_copy0(i32, ...) {
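+; llvm.va_copy duplicates the single va_list pointer, so the copied list
+; resumes scanning the parameter area exactly where the source list stopped.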
+; CHECK-LABEL: va_copy0:
+; CHECK: ldl.sx %s0,
+; CHECK: ld2b.sx %s18,
+; CHECK: ld1b.sx %s19,
+; CHECK: ldl.sx %s20,
+; CHECK: ld2b.zx %s21,
+; CHECK: ld1b.zx %s22,
+; CHECK: ldu %s23,
+; CHECK: ld %s24,
+; CHECK: ld %s25,
+
+  %va = alloca i8*, align 8
+  %va.i8 = bitcast i8** %va to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %va.i8)
+  call void @llvm.va_start(i8* nonnull %va.i8)
+  %vb = alloca i8*, align 8
+  %vb.i8 = bitcast i8** %vb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %vb.i8)
+  call void @llvm.va_copy(i8* nonnull %vb.i8, i8* nonnull %va.i8)
+  call void @llvm.va_end(i8* nonnull %va.i8)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %va.i8)
+  %p1 = va_arg i8** %vb, i32
+  %p2 = va_arg i8** %vb, i16
+  %p3 = va_arg i8** %vb, i8
+  %p4 = va_arg i8** %vb, i32
+  %p5 = va_arg i8** %vb, i16
+  %p6 = va_arg i8** %vb, i8
+  %p7 = va_arg i8** %vb, float
+  %p8 = va_arg i8** %vb, i8*
+  %p9 = va_arg i8** %vb, i64
+  %p10 = va_arg i8** %vb, double
+  call void @llvm.va_end(i8* nonnull %vb.i8)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %vb.i8)
+  call void @use_i32(i32 %p1)
+  call void @use_s16(i16 %p2)
+  call void @use_s8(i8 %p3)
+  call void @use_i32(i32 %p4)
+  call void @use_u16(i16 %p5)
+  call void @use_u8(i8 %p6)
+  call void @use_float(float %p7)
+  call void @use_i8p(i8* %p8)
+  call void @use_i64(i64 %p9)
+  call void @use_double(double %p10)
+  ret i32 0
+}
+
+define i32 @va_copy8(i32, ...) {
+; CHECK-LABEL: va_copy8:
+; CHECK: ldl.sx %s0,
+; CHECK: ld2b.sx %s18,
+; CHECK: ld1b.sx %s19,
+; CHECK: ldl.sx %s20,
+; CHECK: ld2b.zx %s21,
+; CHECK: ld1b.zx %s22,
+; CHECK: ldu %s23,
+; CHECK: ld %s24,
+; CHECK: ld %s25,
+
+  %va = alloca i8*, align 8
+  %va.i8 = bitcast i8** %va to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %va.i8)
+  call void @llvm.va_start(i8* nonnull %va.i8)
+  %p1 = va_arg i8** %va, i32
+  %p2 = va_arg i8** %va, i16
+  %p3 = va_arg i8** %va, i8
+  %p4 = va_arg i8** %va, i32
+  %p5 = va_arg i8** %va, i16
+  %p6 = va_arg i8** %va, i8
+  %p7 = va_arg i8** %va, float
+
+  %vc = alloca i8*, align 8
+  %vc.i8 = bitcast i8** %vc to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %vc.i8)
+  call void @llvm.va_copy(i8* nonnull %vc.i8, i8* nonnull %va.i8)
+  call void @llvm.va_end(i8* nonnull %va.i8)
+  %p8 = va_arg i8** %vc, i8*
+  %p9 = va_arg i8** %vc, i64
+  %p10 = va_arg i8** %vc, double
+  call void @llvm.va_end(i8* nonnull %vc.i8)
+  call void @use_i32(i32 %p1)
+  call void @use_s16(i16 %p2)
+  call void @use_s8(i8 %p3)
+  call void @use_i32(i32 %p4)
+  call void @use_u16(i16 %p5)
+  call void @use_u8(i8 %p6)
+  call void @use_float(float %p7)
+  call void @use_i8p(i8* %p8)
+  call void @use_i64(i64 %p9)
+  call void @use_double(double %p10)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %va.i8)
+  ret i32 0
+}
+
+declare void @use_i64(i64)
+declare void @use_i32(i32)
+declare void @use_u16(i16 zeroext)
+declare void @use_u8(i8 zeroext)
+declare void @use_s16(i16 signext)
+declare void @use_s8(i8 signext)
+declare void @use_i8p(i8*)
+declare void @use_float(float)
+declare void @use_double(double)
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_copy(i8*, i8*)
+declare void @llvm.va_end(i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
diff --git a/llvm/test/CodeGen/VE/va_caller.ll b/llvm/test/CodeGen/VE/va_caller.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/va_caller.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
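+
+; The callee is variadic, so CC_VE2 assigns every operand to the parameter
+; area: the caller stores all arguments to the stack at 176(,%s11) and up,
+; in addition to passing the leading ones in registers.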
+
+declare i32 @func(i32, ...)
+
+define i32 @caller() {
+; CHECK-LABEL: caller:
+; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK-NEXT: st %s18, 48(,%s9) # 8-byte Folded Spill
+; CHECK-NEXT: or %s7, 0, (0)1
+; CHECK-NEXT: st %s7, 280(,%s11)
+; CHECK-NEXT: or %s0, 11, (0)1
+; CHECK-NEXT: st %s0, 272(,%s11)
+; CHECK-NEXT: st %s7, 264(,%s11)
+; CHECK-NEXT: or %s0, 10, (0)1
+; CHECK-NEXT: st %s0, 256(,%s11)
+; CHECK-NEXT: lea.sl %s0, 1075970048
+; CHECK-NEXT: st %s0, 248(,%s11)
+; CHECK-NEXT: or %s0, 8, (0)1
+; CHECK-NEXT: st %s0, 240(,%s11)
+; CHECK-NEXT: st %s7, 232(,%s11)
+; CHECK-NEXT: lea %s0, 1086324736
+; CHECK-NEXT: stl %s0, 228(,%s11)
+; CHECK-NEXT: or %s5, 5, (0)1
+; CHECK-NEXT: stl %s5, 216(,%s11)
+; CHECK-NEXT: or %s4, 4, (0)1
+; CHECK-NEXT: stl %s4, 208(,%s11)
+; CHECK-NEXT: or %s3, 3, (0)1
+; CHECK-NEXT: stl %s3, 200(,%s11)
+; CHECK-NEXT: or %s2, 2, (0)1
+; CHECK-NEXT: stl %s2, 192(,%s11)
+; CHECK-NEXT: or %s1, 1, (0)1
+; CHECK-NEXT: stl %s1, 184(,%s11)
+; CHECK-NEXT: or %s18, 0, (0)1
+; CHECK-NEXT: lea %s0, func@lo
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lea.sl %s12, func@hi(%s0)
+; CHECK-NEXT: lea.sl %s0, 1086324736
+; CHECK-NEXT: stl %s18, 176(,%s11)
+; CHECK-NEXT: or %s6, 0, %s0
+; CHECK-NEXT: or %s0, 0, %s18
+; CHECK-NEXT: bsic %lr, (,%s12)
+; CHECK-NEXT: or %s0, 0, %s18
+; CHECK-NEXT: ld %s18, 48(,%s9) # 8-byte Folded Reload
+; CHECK-NEXT: or %s11, 0, %s9
+  call i32 (i32, ...) @func(i32 0, i16 1, i8 2, i32 3, i16 4, i8 5, float 6.0, i8* null, i64 8, double 9.0, i128 10, i128 11)
+  ret i32 0
+}
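
For reviewers, here is a rough C++ model of the va_list walk that the LowerVAARG code above produces. This is an illustrative sketch only, not part of the patch: the VEVaList struct and the helper names are invented, and the real lowering emits SelectionDAG nodes rather than library code. It assumes the layout documented above, i.e. every variadic slot is 8 bytes and an f32 payload occupies the higher-addressed 4 bytes of its slot, which is why LowerVAARG adds an internal offset of 4 before loading.

    #include <cstdint>
    #include <cstring>

    // Invented model of a VE va_list: a single pointer walking the
    // caller-prepared parameter area in 8-byte slots.
    struct VEVaList {
      char *Ptr;
    };

    // f32: the payload sits 4 bytes into its 8-byte slot ("| empty| float|").
    inline float va_arg_f32(VEVaList &VL) {
      float F;
      std::memcpy(&F, VL.Ptr + 4, sizeof(F));
      VL.Ptr += 8; // advance to the next 8-byte slot
      return F;
    }

    // i64, f64, and pointers: the payload fills the whole 8-byte slot.
    inline int64_t va_arg_i64(VEVaList &VL) {
      int64_t V;
      std::memcpy(&V, VL.Ptr, sizeof(V));
      VL.Ptr += 8;
      return V;
    }

Under this model, va_start reduces to setting Ptr to %s9 plus VarArgsFrameOffset (184 in the tests above: the 176-byte register save area plus one 8-byte slot for the fixed first argument), and va_copy is a plain pointer copy, which is what the Expand actions for VACOPY and VAEND rely on.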