Index: lib/Target/NDS32/NDS32ISelLowering.h
===================================================================
--- lib/Target/NDS32/NDS32ISelLowering.h
+++ lib/Target/NDS32/NDS32ISelLowering.h
@@ -138,6 +138,30 @@
     SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
                           unsigned Flag) const;
 
+    typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
+
+    /// copyByValRegs - Copy argument registers which were used to pass a byval
+    /// argument to the stack. Create a stack frame object for the byval
+    /// argument.
+    void copyByValRegs(SDValue Chain, const SDLoc &DL,
+                       std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+                       const ISD::ArgFlagsTy &Flags,
+                       SmallVectorImpl<SDValue> &InVals,
+                       const Argument *FuncArg, unsigned FirstReg,
+                       unsigned LastReg, const CCValAssign &VA,
+                       CCState &State) const;
+
+    /// passByValArg - Pass a byval argument in registers or on stack.
+    void passByValArg(SDValue Chain, const SDLoc &DL,
+                      RegsToPassVector &RegsToPass,
+                      SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+                      MachineFrameInfo &MFI, SelectionDAG &DAG, SDValue Arg,
+                      unsigned FirstReg, unsigned LastReg,
+                      const ISD::ArgFlagsTy &Flags, bool isLittle,
+                      const CCValAssign &VA) const;
+
+    void HandleByVal(CCState *, unsigned &, unsigned) const override;
+
     /// RestoreVarArgRegs - Restore variable function arguments passed in
     /// registers to the stack. Also create a stack frame object for the
     /// first variable argument.
@@ -151,8 +175,6 @@
                          const SDLoc &dl, SelectionDAG &DAG,
                          SmallVectorImpl<SDValue> &InVals) const override;
 
-    typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
-
     SDValue
       LowerCall(TargetLowering::CallLoweringInfo &CLI,
                 SmallVectorImpl<SDValue> &InVals) const override;
Index: lib/Target/NDS32/NDS32ISelLowering.cpp
===================================================================
--- lib/Target/NDS32/NDS32ISelLowering.cpp
+++ lib/Target/NDS32/NDS32ISelLowering.cpp
@@ -354,6 +354,189 @@
     NDS32::R3, NDS32::R4, NDS32::R5
 };
 
+void NDS32TargetLowering::copyByValRegs(
+    SDValue Chain, const SDLoc &DL, std::vector<SDValue> &OutChains,
+    SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
+    SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
+    unsigned FirstReg, unsigned LastReg, const CCValAssign &VA,
+    CCState &State) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned GPRSizeInBytes = 4;
+  unsigned NumRegs = LastReg - FirstReg;
+  unsigned RegAreaSize = NumRegs * GPRSizeInBytes;
+  unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
+  int FrameObjOffset;
+  ArrayRef<MCPhysReg> ByValArgRegs = makeArrayRef(NDS32ArgRegs);
+  // Make a frame object for the byval argument and store its registers there.
+  if (RegAreaSize)
+    FrameObjOffset =
+        - (int)((ByValArgRegs.size() - FirstReg) * GPRSizeInBytes);
+  else
+    FrameObjOffset = VA.getLocMemOffset(); // No registers were used.
+
+  // Create the fixed frame object; its frame index is pushed onto InVals.
+  EVT PtrTy = getPointerTy(DAG.getDataLayout());
+  int FI = MFI.CreateFixedObject(FrameObjSize, FrameObjOffset, true);
+  SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+  InVals.push_back(FIN);
+
+  if (!NumRegs)
+    return;
+
+  // Store each live-in argument register to consecutive words of the object.
+  MVT RegTy = MVT::getIntegerVT(GPRSizeInBytes * 8);
+  const TargetRegisterClass *RC = getRegClassFor(RegTy);
+
+  for (unsigned I = 0; I < NumRegs; ++I) {
+    unsigned ArgReg = ByValArgRegs[FirstReg + I];
+    unsigned VReg = addLiveIn(MF, ArgReg, RC);
+    unsigned Offset = I * GPRSizeInBytes;
+    SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
+                                   DAG.getConstant(Offset, DL, PtrTy));
+    SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
+                                 StorePtr, MachinePointerInfo(FuncArg, Offset));
+    OutChains.push_back(Store);
+  }
+}
+
+// passByValArg - Load a byval argument into registers and/or copy it to stack.
+void NDS32TargetLowering::passByValArg(
+    SDValue Chain, const SDLoc &DL,
+    RegsToPassVector &RegsToPass,
+    SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+    MachineFrameInfo &MFI, SelectionDAG &DAG, SDValue Arg, unsigned FirstReg,
+    unsigned LastReg, const ISD::ArgFlagsTy &Flags, bool isLittle,
+    const CCValAssign &VA) const {
+  unsigned ByValSizeInBytes = Flags.getByValSize();
+  unsigned OffsetInBytes = 0; // From beginning of struct
+  unsigned RegSizeInBytes = 4;
+  unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
+  EVT PtrTy = getPointerTy(DAG.getDataLayout()),
+      RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
+  unsigned NumRegs = LastReg - FirstReg;
+
+  if (NumRegs) {
+    ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(NDS32ArgRegs);
+    bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes);
+    unsigned I = 0;
+
+    // Load whole words into registers; hold one back for a partial word.
+    for (; I < NumRegs - LeftoverBytes; ++I, OffsetInBytes += RegSizeInBytes) {
+      SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                                    DAG.getConstant(OffsetInBytes, DL, PtrTy));
+      SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
+                                    MachinePointerInfo(), Alignment);
+      MemOpChains.push_back(LoadVal.getValue(1));
+      unsigned ArgReg = ArgRegs[FirstReg + I];
+      RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
+    }
+
+    // Return if the struct has been fully copied.
+    if (ByValSizeInBytes == OffsetInBytes)
+      return;
+
+    // Copy the remainder of the byval argument with sub-word loads and shifts.
+    if (LeftoverBytes) {
+      SDValue Val;
+      // Try a half-word load, then a byte load, OR-ing each piece into Val.
+      for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0;
+           OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) {
+        unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes;
+
+        if (RemainingSizeInBytes < LoadSizeInBytes)
+          continue;
+
+        // Load the subword, zero-extended to register width.
+        SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                                      DAG.getConstant(OffsetInBytes, DL,
+                                                      PtrTy));
+        SDValue LoadVal = DAG.getExtLoad(
+            ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
+            MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment);
+        MemOpChains.push_back(LoadVal.getValue(1));
+
+        // Shift the loaded piece to its byte position within the register.
+        unsigned Shamt;
+
+        if (isLittle)
+          Shamt = TotalBytesLoaded * 8;
+        else
+          Shamt = (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8;
+
+        SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal,
+                                    DAG.getConstant(Shamt, DL, MVT::i32));
+
+        if (Val.getNode())
+          Val = DAG.getNode(ISD::OR, DL, RegTy, Val, Shift);
+        else
+          Val = Shift;
+
+        OffsetInBytes += LoadSizeInBytes;
+        TotalBytesLoaded += LoadSizeInBytes;
+        Alignment = std::min(Alignment, LoadSizeInBytes);
+      }
+
+      unsigned ArgReg = ArgRegs[FirstReg + I];
+      RegsToPass.push_back(std::make_pair(ArgReg, Val));
+      return;
+    }
+  }
+
+  // Copy the remainder of the byval arg to its stack slot with memcpy.
+  unsigned MemCpySize = ByValSizeInBytes - OffsetInBytes;
+  SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                            DAG.getConstant(OffsetInBytes, DL, PtrTy));
+  SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
+                            DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+  Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
+                        DAG.getConstant(MemCpySize, DL, PtrTy),
+                        Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
+                        /*isTailCall=*/false,
+                        MachinePointerInfo(), MachinePointerInfo());
+  MemOpChains.push_back(Chain);
+}
+
+void NDS32TargetLowering::HandleByVal(CCState *State, unsigned &Size,
+                                      unsigned Align) const {
+  const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
+  // Reserve argument registers for a byval argument and record their range.
+  assert(Size && "Byval argument's size shouldn't be 0.");
+
+  Align = std::min(Align, TFL->getStackAlignment());
+
+  unsigned FirstReg = 0;
+  unsigned NumRegs = 0;
+
+  if (State->getCallingConv() != CallingConv::Fast) {
+    unsigned RegSizeInBytes = 4;
+    ArrayRef<MCPhysReg> IntArgRegs = makeArrayRef(NDS32ArgRegs);
+    const MCPhysReg *ShadowRegs = IntArgRegs.data();
+
+    // We used to check the size as well but we can't do that anymore since
+    // CCState::HandleByVal() rounds up the size after calling this function.
+    assert(!(Align % RegSizeInBytes) &&
+           "Byval argument's alignment should be a multiple of "
+           "RegSizeInBytes.");
+
+    FirstReg = State->getFirstUnallocated(IntArgRegs);
+
+    // If Align > RegSizeInBytes, the first arg register must be even.
+    if ((Align > RegSizeInBytes) && (FirstReg % 2)) {
+      State->AllocateReg(IntArgRegs[FirstReg], ShadowRegs[FirstReg]);
+      ++FirstReg;
+    }
+
+    // Mark the registers allocated; Size keeps whatever did not fit in them.
+    Size = alignTo(Size, RegSizeInBytes);
+    for (unsigned I = FirstReg; Size > 0 && (I < IntArgRegs.size());
+         Size -= RegSizeInBytes, ++I, ++NumRegs)
+      State->AllocateReg(IntArgRegs[I], ShadowRegs[I]);
+  }
+  // Record the half-open register range [FirstReg, FirstReg + NumRegs).
+  State->addInRegsParamInfo(FirstReg, FirstReg + NumRegs);
+}
+
 // RestoreVarArgRegs - Store VarArg register to the stack
 void NDS32TargetLowering::RestoreVarArgRegs(std::vector<SDValue> &OutChains,
                                             SDValue Chain, const SDLoc &DL,
@@ -448,6 +631,23 @@
       std::advance(FuncArg, Ins[i].getOrigArgIndex() - CurArgIdx);
       CurArgIdx = Ins[i].getOrigArgIndex();
     }
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+    if (Flags.isByVal()) {
+      assert(Ins[i].isOrigArg() && "Byval arguments cannot be implicit");
+      unsigned FirstByValReg, LastByValReg;
+      unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
+      CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
+
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      assert(ByValIdx < CCInfo.getInRegsParamsCount());
+      copyByValRegs(Chain, DL, OutChains, DAG, Flags, InVals, &*FuncArg,
+                    FirstByValReg, LastByValReg, VA, CCInfo);
+      CCInfo.nextInRegsParam();
+      continue;
+    }
+
     bool IsRegLoc = VA.isRegLoc();
     // Arguments stored on registers
     if (IsRegLoc) {
@@ -573,6 +773,9 @@
   CallingConv::ID CallConv              = CLI.CallConv;
   bool IsVarArg                         = CLI.IsVarArg;
 
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
   // NDS32 target does not support tail call optimization yet.
   IsTailCall = false;
 
@@ -609,6 +812,23 @@
     CCValAssign &VA = ArgLocs[i];
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
 
+    // ByVal Arg.
+    if (Flags.isByVal()) {
+      unsigned FirstByValReg, LastByValReg;
+      unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
+      CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
+
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      assert(ByValIdx < CCInfo.getInRegsParamsCount());
+      assert(!IsTailCall &&
+             "Do not tail-call optimize if there is a byval argument.");
+      passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
+                   FirstByValReg, LastByValReg, Flags, true, VA);
+      CCInfo.nextInRegsParam();
+      continue;
+    }
+
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
     default: llvm_unreachable("Unknown loc info!");
Index: test/CodeGen/NDS32/by-val.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NDS32/by-val.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "nds32le---elf"
+
+%struct.tiny = type { i32 }
+
+; Function Attrs: nounwind. Calls @abort unless byval %x, %y, %z hold 10, 11, 12.
+define i32 @f(i32 %n, %struct.tiny* byval nocapture readonly align 4 %x, %struct.tiny* byval nocapture readonly align 4 %y, %struct.tiny* byval nocapture readonly align 4 %z) local_unnamed_addr #0 {
+entry:
+  %c = getelementptr inbounds %struct.tiny, %struct.tiny* %x, i32 0, i32 0
+  %0 = load i32, i32* %c, align 4
+  %cmp = icmp eq i32 %0, 10
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void @abort() #4
+  unreachable
+
+if.end:                                           ; preds = %entry
+  %c1 = getelementptr inbounds %struct.tiny, %struct.tiny* %y, i32 0, i32 0
+  %1 = load i32, i32* %c1, align 4
+  %cmp2 = icmp eq i32 %1, 11
+  br i1 %cmp2, label %if.end4, label %if.then3
+
+if.then3:                                         ; preds = %if.end
+  tail call void @abort() #4
+  unreachable
+
+if.end4:                                          ; preds = %if.end
+  %c5 = getelementptr inbounds %struct.tiny, %struct.tiny* %z, i32 0, i32 0
+  %2 = load i32, i32* %c5, align 4
+  %cmp6 = icmp eq i32 %2, 12
+  br i1 %cmp6, label %if.end8, label %if.then7
+
+if.then7:                                         ; preds = %if.end4
+  tail call void @abort() #4
+  unreachable
+
+if.end8:                                          ; preds = %if.end4
+  ret i32 undef
+}
+
+; Function Attrs: noreturn
+declare void @abort() local_unnamed_addr #1
+
+; Function Attrs: noreturn nounwind. Passes array elements 10, 11, 12 byval to @f.
+define i32 @main() local_unnamed_addr #2 {
+entry:
+  %x = alloca [3 x %struct.tiny], align 4
+  %0 = bitcast [3 x %struct.tiny]* %x to i8*
+  call void @llvm.lifetime.start(i64 12, i8* nonnull %0) #5
+  %arrayidx = getelementptr inbounds [3 x %struct.tiny], [3 x %struct.tiny]* %x, i32 0, i32 0
+  %c = getelementptr inbounds [3 x %struct.tiny], [3 x %struct.tiny]* %x, i32 0, i32 0, i32 0
+  store i32 10, i32* %c, align 4
+  %arrayidx1 = getelementptr inbounds [3 x %struct.tiny], [3 x %struct.tiny]* %x, i32 0, i32 1
+  %c2 = getelementptr inbounds %struct.tiny, %struct.tiny* %arrayidx1, i32 0, i32 0
+  store i32 11, i32* %c2, align 4
+  %arrayidx3 = getelementptr inbounds [3 x %struct.tiny], [3 x %struct.tiny]* %x, i32 0, i32 2
+  %c4 = getelementptr inbounds %struct.tiny, %struct.tiny* %arrayidx3, i32 0, i32 0
+  store i32 12, i32* %c4, align 4
+; CHECK: movi55  $r0, 10
+; CHECK: swi     $r0, [$sp + (0)]
+; CHECK: movi55  $r1, 4
+; CHECK: movi55  $r2, 11
+; CHECK: sw      $r2, [$r1 + $r0]
+; CHECK: movi55  $r2, 8
+; CHECK: movi55  $r3, 12
+; CHECK: sw      $r3, [$r2 + $r0]
+; CHECK: lw      $r3, [$r2 + $r0]
+; CHECK: lw      $r2, [$r1 + $r0]
+; CHECK: lwi     $r1, [$sp + (0)]
+  %call = call i32 @f(i32 undef, %struct.tiny* byval nonnull align 4 %arrayidx, %struct.tiny* byval align 4 %arrayidx1, %struct.tiny* byval align 4 %arrayidx3)
+  tail call void @exit(i32 0) #4
+  unreachable
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+
+; Function Attrs: noreturn
+declare void @exit(i32) local_unnamed_addr #1