Index: include/llvm/CodeGen/CommandFlags.h
===================================================================
--- include/llvm/CodeGen/CommandFlags.h
+++ include/llvm/CodeGen/CommandFlags.h
@@ -45,6 +45,11 @@
        cl::desc("Target specific attributes (-mattr=help for details)"),
        cl::value_desc("a1,+a2,-a3,..."));
 
+cl::opt<unsigned> RegParm(
+    "regparm", cl::desc("set number of register parameters (X86 only)"),
+    cl::value_desc("[0-3]"),
+    cl::init(0));
+
 cl::opt<Reloc::Model> RelocModel(
     "relocation-model", cl::desc("Choose relocation model"),
     cl::values(
@@ -283,6 +288,7 @@
       EnableHonorSignDependentRoundingFPMath;
   if (FloatABIForCalls != FloatABI::Default)
     Options.FloatABIType = FloatABIForCalls;
+  Options.RegParm      = RegParm;
   Options.NoZerosInBSS = DontPlaceZerosInBSS;
   Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
   Options.StackAlignmentOverride = OverrideStackAlignment;
Index: include/llvm/Target/TargetOptions.h
===================================================================
--- include/llvm/Target/TargetOptions.h
+++ include/llvm/Target/TargetOptions.h
@@ -108,8 +108,8 @@
           DisableIntegratedAS(false), CompressDebugSections(false),
           RelaxELFRelocations(false), FunctionSections(false),
           DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
-          EmulatedTLS(false), EnableIPRA(false),
-          FloatABIType(FloatABI::Default),
+          EmulatedTLS(false), EnableIPRA(false), RegParm(0),
+          FloatABIType(FloatABI::Default), 
           AllowFPOpFusion(FPOpFusion::Standard),
           ThreadModel(ThreadModel::POSIX),
           EABIVersion(EABI::Default), DebuggerTuning(DebuggerKind::Default),
@@ -225,6 +225,9 @@
     /// This flag enables InterProcedural Register Allocation (IPRA).
     unsigned EnableIPRA : 1;
 
+    /// RegParm - Number of register parameters (set by -regparm; defaults to 0).
+    unsigned RegParm;
+
     /// FloatABIType - This setting is set by -float-abi=xxx option is specfied
     /// on the command line. This setting may either be Default, Soft, or Hard.
     /// Default selects the target's default behavior. Soft selects the ABI for
Index: lib/Target/X86/X86CallingConv.h
===================================================================
--- lib/Target/X86/X86CallingConv.h
+++ lib/Target/X86/X86CallingConv.h
@@ -18,6 +18,9 @@
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "X86TargetMachine.h"
 
 namespace llvm {
 
@@ -50,18 +53,30 @@
     " doesn't support long double and mask types yet.");
 }
 
-inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT,
+inline bool CC_X86_32_AssignToReg_NoSplit(unsigned &ValNo, MVT &ValVT,
                                          MVT &LocVT,
                                          CCValAssign::LocInfo &LocInfo,
                                          ISD::ArgFlagsTy &ArgFlags,
                                          CCState &State) {
-  // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
-  // not to split i64 and double between a register and stack
-  static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
-  static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]);
-  
+  // Bail out if the argument is InAlloca or ByVal.
+  if (ArgFlags.isInAlloca() || ArgFlags.isByVal())
+    return false;
+
+  // Similar to CCAssignToReg, but do not split multi-reg args
+  // (i64/double) between a register and stack.
+  MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
+  static const unsigned MaxRegs = sizeof(RegList)/sizeof(RegList[0]);
+
+  auto NumRegs = State.getMachineFunction().getTarget().Options.RegParm;
+  if (static_cast<const X86Subtarget&>(State.getMachineFunction().getSubtarget()).isTargetMCU())
+    NumRegs = MaxRegs;
+
+  assert(NumRegs <= MaxRegs && "More register parameters than registers");
+
   SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
 
+  unsigned FirstFree = std::min(NumRegs, State.getFirstUnallocated(RegList));
+
   // If this is the first part of an double/i64/i128, or if we're already
   // in the middle of a split, add to the pending list. If this is not
   // the end of the split, return, otherwise go on to process the pending
@@ -76,10 +91,11 @@
   // If there are no pending members, we are not in the middle of a split,
   // so do the usual inreg stuff.
   if (PendingMembers.empty()) {
-    if (unsigned Reg = State.AllocateReg(RegList)) {
-      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-      return true;
-    }
+    if (FirstFree < NumRegs)
+      if (unsigned Reg = State.AllocateReg(RegList[FirstFree++])) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+        return true;
+      }
     return false;
   }
 
@@ -87,17 +103,19 @@
 
   // We now have the entire original argument in PendingMembers, so decide
   // whether to use registers or the stack.
-  // Per the MCU ABI:
   // a) To use registers, we need to have enough of them free to contain
   // the entire argument.
   // b) We never want to use more than 2 registers for a single argument.
 
-  unsigned FirstFree = State.getFirstUnallocated(RegList);
   bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
 
   for (auto &It : PendingMembers) {
+    // When the argument goes to the stack, shadow-allocate a register only while
+    // one is still free within the RegParm budget, so later arguments cannot use it.
     if (UseRegs)
       It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
+    else if (FirstFree < NumRegs)
+      It.convertToMem(State.AllocateStack(4, 4, RegList[FirstFree++]));
     else
       It.convertToMem(State.AllocateStack(4, 4));
     State.addLoc(It);
@@ -111,4 +129,3 @@
 } // End llvm namespace
 
 #endif
-
Index: lib/Target/X86/X86CallingConv.td
===================================================================
--- lib/Target/X86/X86CallingConv.td
+++ lib/Target/X86/X86CallingConv.td
@@ -796,24 +796,22 @@
   // The first 3 integer arguments, if marked 'inreg' and if the call is not
   // a vararg call, are passed in integer registers.
   CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+  // Assign i32 arguments to registers, up to the limit set by the RegParm option.
+  CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_AssignToReg_NoSplit">>>,
 
   // Otherwise, same as everything else.
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
-def CC_X86_32_MCU : CallingConv<[
-  // Handles byval parameters.  Note that, like FastCC, we can't rely on
-  // the delegation to CC_X86_32_Common because that happens after code that
-  // puts arguments in registers.
-  CCIfByVal<CCPassByVal<4, 4>>,
 
+def CC_X86_32_MCU : CallingConv<[
   // Promote i1/i8/i16 arguments to i32.
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
 
   // If the call is not a vararg call, some arguments may be passed
   // in integer registers.
-  CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>,
-
+  CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_AssignToReg_NoSplit">>>,
+  
   // Otherwise, same as everything else.
   CCDelegateTo<CC_X86_32_Common>
 ]>;
@@ -990,7 +988,6 @@
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
   CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
-
   // Otherwise, drop to normal X86-32 CC
   CCDelegateTo<CC_X86_32_C>
 ]>;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1,3 +1,4 @@
+
 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
 //
 //                     The LLVM Compiler Infrastructure
@@ -104,6 +105,11 @@
       addBypassSlowDiv(64, 16);
   }
 
+  // Set all builtin calling conventions to BuiltinCC.
+  auto BuiltinCC = CallingConv::C;
+  for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
+    setLibcallCallingConv((RTLIB::Libcall)i, BuiltinCC);
+
   if (Subtarget.isTargetKnownWindowsMSVC() ||
       Subtarget.isTargetWindowsItanium()) {
     // Setup Windows compiler runtime calls.
Index: test/CodeGen/X86/pr18415.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr18415.ll
@@ -0,0 +1,52 @@
+; RUN: llc %s -mtriple i386-unknown-linux-gnu -regparm 0 -o - | FileCheck %s -check-prefix CHECK0
+; RUN: llc %s -mtriple i386-unknown-linux-gnu -regparm 3 -o - | FileCheck %s -check-prefix CHECK3
+
+; ModuleID = '/usr/local/google/home/niravd/pr18415.c'
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @use_foo(i8* inreg %dest, i8* inreg %src, i32 inreg %n) #0 {
+
+; CHECK0-LABEL: @use_foo
+; CHECK0-NOT: pushl
+; CHECK0: jmp foo
+; CHECK0-NOT: retl
+; CHECK3-LABEL: @use_foo
+; CHECK3-NOT: pushl
+; CHECK3: jmp foo
+; CHECK3-NOT: retl
+%1 = tail call i8* @foo(i8* %dest, i8* %src, i32 %n) #4
+  ret void
+}
+
+declare i8* @foo(i8* inreg, i8* inreg, i32 inreg) #1
+
+; Function Attrs: norecurse nounwind
+define void @use_memcpy(i8* inreg nocapture %dest, i8* inreg nocapture readonly %src, i32 inreg %n) #2 {
+; CHECK0-LABEL: @use_memcpy
+; CHECK0:	pushl	%ecx
+; CHECK0:	pushl	%edx
+; CHECK0: 	pushl	%eax
+; CHECK0: 	calll memcpy
+; CHECK0:	retl
+; CHECK3-LABEL: @use_memcpy
+; CHECK3-NOT: pushl
+; CHECK3: jmp memcpy
+; CHECK3-NOT: retl
+tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #3
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "regparm"="2"}
+attributes #3 = { argmemonly nounwind}
+attributes #4 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.8.0-2ubuntu3~trusty4 (tags/RELEASE_380/final)"}
Index: test/CodeGen/X86/regparm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/regparm.ll
@@ -0,0 +1,104 @@
+; RUN: llc %s -mtriple i386-unknown-linux-gnu -regparm 0 -o - | FileCheck %s -check-prefix CHECK0
+; RUN: llc %s -mtriple i386-unknown-linux-gnu -regparm 1 -o - | FileCheck %s -check-prefix CHECK1
+; RUN: llc %s -mtriple i386-unknown-linux-gnu -regparm 2 -o - | FileCheck %s -check-prefix CHECK2
+; RUN: llc %s -mtriple i386-unknown-linux-gnu -regparm 3 -o - | FileCheck %s -check-prefix CHECK3
+target triple = "i386-unknown-linux-gnu"
+
+;CHECK0-LABEL: @test0
+;CHECK0:      movl 4(%esp), %eax
+;CHECK0-NEXT: addl 8(%esp), %eax 
+;CHECK0-NEXT: addl 12(%esp), %eax 
+;CHECK0-NEXT: retl
+
+;CHECK1-LABEL: @test0
+;CHECK1:      addl 4(%esp), %eax
+;CHECK1-NEXT: addl 8(%esp), %eax 
+;CHECK1-NEXT: retl
+
+;CHECK2-LABEL: @test0
+;CHECK2:      addl %edx, %eax
+;CHECK2-NEXT: addl 4(%esp), %eax 
+;CHECK2-NEXT: retl
+
+;CHECK3-LABEL: @test0
+;CHECK3:      addl %edx, %eax
+;CHECK3-NEXT: addl %ecx, %eax 
+;CHECK3-NEXT: retl
+
+define i32 @test0(i32 %a, i32 %b, i32 %c, i32 %d) {
+ %1 = add i32 %a, %b
+ %2 = add i32 %1, %c
+ %3 = add i32 %2, %d
+ ret i32 %2
+}
+
+; An i64 requires 2 registers. If it does not fit, the remaining free register is still marked allocated, so later arguments are passed on the stack.
+
+;CHECK1-LABEL: @test1
+;CHECK1:      movl 4(%esp), %eax
+;CHECK1-NEXT: addl 12(%esp), %eax
+;CHECK1-NEXT: addl  8(%esp), %eax
+;CHECK1-NEXT: addl 16(%esp), %eax
+;CHECK1-NEXT: addl 20(%esp), %eax 
+;CHECK1-NEXT: retl
+
+;CHECK2-LABEL: @test1
+;CHECK2:      addl 4(%esp), %eax
+;CHECK2-NEXT: addl   %edx, %eax
+;CHECK2-NEXT: addl 8(%esp), %eax
+;CHECK2-NEXT: addl 12(%esp), %eax 
+;CHECK2-NEXT: retl
+
+;CHECK3-LABEL: @test1
+;CHECK3:      addl %ecx, %eax
+;CHECK3-NEXT: addl %edx, %eax
+;CHECK3-NEXT: addl 4(%esp), %eax
+;CHECK3-NEXT: addl 8(%esp), %eax 
+;CHECK3-NEXT: retl
+
+define i32 @test1(i64 %a, i32 %b, i32 %c, i32 %d) {
+  %shr = lshr i64 %a, 32
+  %conv = trunc i64 %shr to i32
+  %conv1 = trunc i64 %a to i32
+  %add = add i32 %conv1, %b
+  %add2 = add i32 %add, %conv
+  %add3 = add i32 %add2, %c
+  %add4 = add i32 %add3, %d
+  ret i32 %add4
+}
+
+;CHECK1-LABEL: @test2
+;CHECK1:      addl	4(%esp), %eax
+;CHECK1-NEXT: addl	8(%esp), %eax
+;CHECK1-NEXT: addl	12(%esp), %eax
+;CHECK1-NEXT: addl	16(%esp), %eax
+;CHECK1-NEXT: retl
+;CHECK2-LABEL: @test2
+;CHECK2:      addl	4(%esp), %eax
+;CHECK2-NEXT: addl	8(%esp), %eax
+;CHECK2-NEXT: addl	12(%esp), %eax
+;CHECK2-NEXT: addl	16(%esp), %eax
+;CHECK2-NEXT: retl
+;CHECK3-LABEL: @test2
+;CHECK3:      addl %edx, %eax
+;CHECK3-NEXT: addl %ecx, %eax
+;CHECK3-NEXT: addl 4(%esp), %eax
+;CHECK3-NEXT: addl 8(%esp), %eax 
+;CHECK3-NEXT: retl
+
+define i32 @test2(i32 %b, i64 %a, i32 %c, i32 %d) {
+  %shr = lshr i64 %a, 32
+  %conv = trunc i64 %shr to i32
+  %conv1 = trunc i64 %a to i32
+  %add = add i32 %conv1, %b
+  %add2 = add i32 %add, %conv
+  %add3 = add i32 %add2, %c
+  %add4 = add i32 %add3, %d
+  ret i32 %add4
+}
+
+
+
+
+
+