diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h
--- a/clang/include/clang/CodeGen/CGFunctionInfo.h
+++ b/clang/include/clang/CodeGen/CGFunctionInfo.h
@@ -88,6 +88,7 @@
   Kind TheKind;
   bool PaddingInReg : 1;
   bool InAllocaSRet : 1;    // isInAlloca()
+  bool InAllocaIndirect : 1;// isInAlloca()
   bool IndirectByVal : 1;   // isIndirect()
   bool IndirectRealign : 1; // isIndirect()
   bool SRetAfterThis : 1;   // isIndirect()
@@ -110,8 +111,8 @@

 public:
   ABIArgInfo(Kind K = Direct)
-      : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0),
-        TheKind(K), PaddingInReg(false), InAllocaSRet(false),
+      : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K),
+        PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false),
         IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false),
         InReg(false), CanBeFlattened(false), SignExt(false) {}

@@ -185,9 +186,10 @@
     AI.setInReg(true);
     return AI;
   }
-  static ABIArgInfo getInAlloca(unsigned FieldIndex) {
+  static ABIArgInfo getInAlloca(unsigned FieldIndex, bool Indirect = false) {
     auto AI = ABIArgInfo(InAlloca);
     AI.setInAllocaFieldIndex(FieldIndex);
+    AI.setInAllocaIndirect(Indirect);
     return AI;
   }
   static ABIArgInfo getExpand() {
@@ -380,6 +382,15 @@
     AllocaFieldIndex = FieldIndex;
   }

+  unsigned getInAllocaIndirect() const {
+    assert(isInAlloca() && "Invalid kind!");
+    return InAllocaIndirect;
+  }
+  void setInAllocaIndirect(bool Indirect) {
+    assert(isInAlloca() && "Invalid kind!");
+    InAllocaIndirect = Indirect;
+  }
+
   /// Return true if this field of an inalloca struct should be returned
   /// to implement a struct return calling convention.
   bool getInAllocaSRet() const {
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2339,6 +2339,9 @@
       auto FieldIndex = ArgI.getInAllocaFieldIndex();
       Address V =
           Builder.CreateStructGEP(ArgStruct, FieldIndex, Arg->getName());
+      if (ArgI.getInAllocaIndirect())
+        V = Address(Builder.CreateLoad(V),
+                    getContext().getTypeAlignInChars(Ty));
       ArgVals.push_back(ParamValue::forIndirect(V));
       break;
     }
@@ -4038,18 +4041,39 @@
      assert(NumIRArgs == 0);
      assert(getTarget().getTriple().getArch() == llvm::Triple::x86);
      if (I->isAggregate()) {
-       // Replace the placeholder with the appropriate argument slot GEP.
        Address Addr = I->hasLValue()
                           ? I->getKnownLValue().getAddress(*this)
                           : I->getKnownRValue().getAggregateAddress();
        llvm::Instruction *Placeholder =
            cast<llvm::Instruction>(Addr.getPointer());
-       CGBuilderTy::InsertPoint IP = Builder.saveIP();
-       Builder.SetInsertPoint(Placeholder);
-       Addr =
-           Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
-       Builder.restoreIP(IP);
+
+       if (!ArgInfo.getInAllocaIndirect()) {
+         // Replace the placeholder with the appropriate argument slot GEP.
+         CGBuilderTy::InsertPoint IP = Builder.saveIP();
+         Builder.SetInsertPoint(Placeholder);
+         Addr = Builder.CreateStructGEP(ArgMemory,
+                                        ArgInfo.getInAllocaFieldIndex());
+         Builder.restoreIP(IP);
+       } else {
+         // For indirect things such as overaligned structs, replace the
+         // placeholder with a regular aggregate temporary alloca. Store the
+         // address of this alloca into the struct.
+         Addr = CreateMemTemp(info_it->type, "inalloca.indirect.tmp");
+         Address ArgSlot = Builder.CreateStructGEP(
+             ArgMemory, ArgInfo.getInAllocaFieldIndex());
+         Builder.CreateStore(Addr.getPointer(), ArgSlot);
+       }
        deferPlaceholderReplacement(Placeholder, Addr.getPointer());
+     } else if (ArgInfo.getInAllocaIndirect()) {
+       // Make a temporary alloca and store the address of it into the argument
+       // struct.
+       Address Addr = CreateMemTempWithoutCast(
+           I->Ty, getContext().getTypeAlignInChars(I->Ty),
+           "indirect-arg-temp");
+       I->copyInto(*this, Addr);
+       Address ArgSlot =
+           Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
+       Builder.CreateStore(Addr.getPointer(), ArgSlot);
      } else {
        // Store the RValue into the argument struct.
        Address Addr =
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -1676,6 +1676,7 @@
   bool IsVectorCall = State.CC == llvm::CallingConv::X86_VectorCall;

   Ty = useFirstFieldIfTransparentUnion(Ty);
+  TypeInfo TI = getContext().getTypeInfo(Ty);

   // Check with the C++ ABI first.
   const RecordType *RT = Ty->getAs<RecordType>();
@@ -1725,7 +1726,7 @@
     bool NeedsPadding = false;
     bool InReg;
     if (shouldAggregateUseDirect(Ty, State, InReg, NeedsPadding)) {
-      unsigned SizeInRegs = (getContext().getTypeSize(Ty) + 31) / 32;
+      unsigned SizeInRegs = (TI.Width + 31) / 32;
       SmallVector<llvm::Type *, 3> Elements(SizeInRegs, Int32);
       llvm::Type *Result = llvm::StructType::get(LLVMContext, Elements);
       if (InReg)
@@ -1735,14 +1736,19 @@
     }

     llvm::IntegerType *PaddingType = NeedsPadding ? Int32 : nullptr;

+    // Pass over-aligned aggregates on Windows indirectly. This behavior was
+    // added in MSVC 2015.
+    if (IsWin32StructABI && TI.AlignIsRequired && TI.Align > 32)
+      return getIndirectResult(Ty, /*ByVal=*/false, State);
+
     // Expand small (<= 128-bit) record types when we know that the stack layout
     // of those arguments will match the struct. This is important because the
     // LLVM backend isn't smart enough to remove byval, which inhibits many
     // optimizations.
     // Don't do this for the MCU if there are still free integer registers
     // (see X86_64 ABI for full explanation).
-    if (getContext().getTypeSize(Ty) <= 4 * 32 &&
-        (!IsMCUABI || State.FreeRegs == 0) && canExpandIndirectArgument(Ty))
+    if (TI.Width <= 4 * 32 && (!IsMCUABI || State.FreeRegs == 0) &&
+        canExpandIndirectArgument(Ty))
       return ABIArgInfo::getExpandWithPadding(
           IsFastCall || IsVectorCall || IsRegCall, PaddingType);
@@ -1750,14 +1756,24 @@
   }

   if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // On Windows, vectors are passed directly if registers are available, or
+    // indirectly if not. This avoids the need to align argument memory. Pass
+    // user-defined vector types larger than 512 bits indirectly for simplicity.
+    if (IsWin32StructABI) {
+      if (TI.Width <= 512 && State.FreeSSERegs > 0) {
+        --State.FreeSSERegs;
+        return ABIArgInfo::getDirectInReg();
+      }
+      return getIndirectResult(Ty, /*ByVal=*/false, State);
+    }
+
     // On Darwin, some vectors are passed in memory, we handle this by passing
     // it as an i8/i16/i32/i64.
    if (IsDarwinVectorABI) {
-      uint64_t Size = getContext().getTypeSize(Ty);
-      if ((Size == 8 || Size == 16 || Size == 32) ||
-          (Size == 64 && VT->getNumElements() == 1))
-        return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
-                                                            Size));
+      if ((TI.Width == 8 || TI.Width == 16 || TI.Width == 32) ||
+          (TI.Width == 64 && VT->getNumElements() == 1))
+        return ABIArgInfo::getDirect(
+            llvm::IntegerType::get(getVMContext(), TI.Width));
    }

    if (IsX86_MMXType(CGT.ConvertType(Ty)))
@@ -1787,9 +1803,10 @@
  CCState State(FI);
  if (IsMCUABI)
    State.FreeRegs = 3;
-  else if (State.CC == llvm::CallingConv::X86_FastCall)
+  else if (State.CC == llvm::CallingConv::X86_FastCall) {
    State.FreeRegs = 2;
-  else if (State.CC == llvm::CallingConv::X86_VectorCall) {
+    State.FreeSSERegs = 3;
+  } else if (State.CC == llvm::CallingConv::X86_VectorCall) {
    State.FreeRegs = 2;
    State.FreeSSERegs = 6;
  } else if (FI.getHasRegParm())
@@ -1797,6 +1814,11 @@
  else if (State.CC == llvm::CallingConv::X86_RegCall) {
    State.FreeRegs = 5;
    State.FreeSSERegs = 8;
+  } else if (IsWin32StructABI) {
+    // Since MSVC 2015, the first three SSE vectors have been passed in
+    // registers. The rest are passed indirectly.
+    State.FreeRegs = DefaultNumRegisterParameters;
+    State.FreeSSERegs = 3;
  } else
    State.FreeRegs = DefaultNumRegisterParameters;
@@ -1843,16 +1865,25 @@
                                        CharUnits &StackOffset, ABIArgInfo &Info,
                                        QualType Type) const {
  // Arguments are always 4-byte-aligned.
-  CharUnits FieldAlign = CharUnits::fromQuantity(4);
+  CharUnits WordSize = CharUnits::fromQuantity(4);
+  assert(StackOffset.isMultipleOf(WordSize) && "unaligned inalloca struct");

-  assert(StackOffset.isMultipleOf(FieldAlign) && "unaligned inalloca struct");
-  Info = ABIArgInfo::getInAlloca(FrameFields.size());
-  FrameFields.push_back(CGT.ConvertTypeForMem(Type));
-  StackOffset += getContext().getTypeSizeInChars(Type);
+  // sret pointers and indirect things will require an extra pointer
+  // indirection, unless they are byval. Most things are byval, and will not
+  // require this indirection.
+  bool IsIndirect = false;
+  if (Info.isIndirect() && !Info.getIndirectByVal())
+    IsIndirect = true;
+  Info = ABIArgInfo::getInAlloca(FrameFields.size(), IsIndirect);
+  llvm::Type *LLTy = CGT.ConvertTypeForMem(Type);
+  if (IsIndirect)
+    LLTy = LLTy->getPointerTo(0);
+  FrameFields.push_back(LLTy);
+  StackOffset += IsIndirect ? WordSize : getContext().getTypeSizeInChars(Type);

  // Insert padding bytes to respect alignment.
  CharUnits FieldEnd = StackOffset;
-  StackOffset = FieldEnd.alignTo(FieldAlign);
+  StackOffset = FieldEnd.alignTo(WordSize);
  if (StackOffset != FieldEnd) {
    CharUnits NumBytes = StackOffset - FieldEnd;
    llvm::Type *Ty = llvm::Type::getInt8Ty(getVMContext());
@@ -1866,16 +1897,12 @@
  switch (Info.getKind()) {
  case ABIArgInfo::InAlloca:
    return true;
-  case ABIArgInfo::Indirect:
-    assert(Info.getIndirectByVal());
-    return true;
  case ABIArgInfo::Ignore:
    return false;
+  case ABIArgInfo::Indirect:
  case ABIArgInfo::Direct:
  case ABIArgInfo::Extend:
-    if (Info.getInReg())
-      return false;
-    return true;
+    return !Info.getInReg();
  case ABIArgInfo::Expand:
  case ABIArgInfo::CoerceAndExpand:
    // These are aggregate types which are never passed in registers when
@@ -1909,8 +1936,7 @@
  // Put the sret parameter into the inalloca struct if it's in memory.
  if (Ret.isIndirect() && !Ret.getInReg()) {
-    CanQualType PtrTy = getContext().getPointerType(FI.getReturnType());
-    addFieldToArgStruct(FrameFields, StackOffset, Ret, PtrTy);
+    addFieldToArgStruct(FrameFields, StackOffset, Ret, FI.getReturnType());
    // On Windows, the hidden sret parameter is always returned in eax.
    Ret.setInAllocaSRet(IsWin32StructABI);
  }

diff --git a/clang/test/CodeGen/x86_32-arguments-win32.c b/clang/test/CodeGen/x86_32-arguments-win32.c
--- a/clang/test/CodeGen/x86_32-arguments-win32.c
+++ b/clang/test/CodeGen/x86_32-arguments-win32.c
@@ -46,3 +46,47 @@
 struct s6 f6_1(void) { while (1) {} }

 void f6_2(struct s6 a0) {}
+
+// MSVC passes up to three vectors in registers, and the rest indirectly. We
+// (arbitrarily) pass oversized vectors indirectly, since that is the safest way
+// to do it.
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
+typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
+typedef float __m1024 __attribute__((__vector_size__(128), __aligned__(128)));
+
+__m128 gv128;
+__m256 gv256;
+__m512 gv512;
+__m1024 gv1024;
+
+void receive_vec_128(__m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+void receive_vec_256(__m256 x, __m256 y, __m256 z, __m256 w, __m256 q) {
+  gv256 = x + y + z + w + q;
+}
+void receive_vec_512(__m512 x, __m512 y, __m512 z, __m512 w, __m512 q) {
+  gv512 = x + y + z + w + q;
+}
+void receive_vec_1024(__m1024 x, __m1024 y, __m1024 z, __m1024 w, __m1024 q) {
+  gv1024 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local void @receive_vec_128(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, <4 x float>* %0, <4 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, <8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, <16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)
+
+void pass_vec_128() {
+  __m128 z = {0};
+  receive_vec_128(z, z, z, z, z);
+}
+
+// CHECK-LABEL: define dso_local void @pass_vec_128()
+// CHECK: call void @receive_vec_128(<4 x float> inreg %{{[^,)]*}}, <4 x float> inreg %{{[^,)]*}}, <4 x float> inreg %{{[^,)]*}}, <4 x float>* %{{[^,)]*}}, <4 x float>* %{{[^,)]*}})
+
+
+void __fastcall fastcall_indirect_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local x86_fastcallcc void @"\01@fastcall_indirect_vec@84"(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, <4 x float>* inreg %0, i32 inreg %edx, <4 x float>* %1)
diff --git a/clang/test/CodeGenCXX/inalloca-overaligned.cpp b/clang/test/CodeGenCXX/inalloca-overaligned.cpp
new file
--- /dev/null
+++ b/clang/test/CodeGenCXX/inalloca-overaligned.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -fms-extensions -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes overaligned types indirectly since MSVC 2015. Make sure that
+// works with inalloca.
+
+// FIXME: Pass non-trivial *and* overaligned types indirectly. Right now the C++
+// ABI rules say to use inalloca, and they take precedence, so it's not easy to
+// implement this.
+
+
+struct NonTrivial {
+  NonTrivial();
+  NonTrivial(const NonTrivial &o);
+  int x;
+};
+
+struct __declspec(align(64)) OverAligned {
+  OverAligned();
+  int buf[16];
+};
+
+extern int gvi32;
+
+int receive_inalloca_overaligned(NonTrivial nt, OverAligned o) {
+  return nt.x + o.buf[0];
+}
+
+// CHECK-LABEL: define dso_local i32 @"?receive_inalloca_overaligned@@Y{{.*}}"
+// CHECK-SAME: (<{ %struct.NonTrivial, %struct.OverAligned* }>* inalloca %0)
+
+int pass_inalloca_overaligned() {
+  gvi32 = receive_inalloca_overaligned(NonTrivial(), OverAligned());
+  return gvi32;
+}
+
+// CHECK-LABEL: define dso_local i32 @"?pass_inalloca_overaligned@@Y{{.*}}"
+// CHECK: [[TMP:%[^ ]*]] = alloca %struct.OverAligned, align 64
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: alloca inalloca <{ %struct.NonTrivial, %struct.OverAligned* }>
+
+// Construct OverAligned into TMP.
+// CHECK: call x86_thiscallcc %struct.OverAligned* @"??0OverAligned@@QAE@XZ"(%struct.OverAligned* [[TMP]])
+
+// Construct NonTrivial into the GEP.
+// CHECK: [[GEP:%[^ ]*]] = getelementptr inbounds <{ %struct.NonTrivial, %struct.OverAligned* }>, <{ %struct.NonTrivial, %struct.OverAligned* }>* %{{.*}}, i32 0, i32 0
+// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* [[GEP]])
+
+// Store the address of an OverAligned temporary into the struct.
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, %struct.OverAligned* }>, <{ %struct.NonTrivial, %struct.OverAligned* }>* %{{.*}}, i32 0, i32 1
+// CHECK: store %struct.OverAligned* [[TMP]], %struct.OverAligned** %{{.*}}, align 4
+// CHECK: call i32 @"?receive_inalloca_overaligned@@Y{{.*}}"(<{ %struct.NonTrivial, %struct.OverAligned* }>* inalloca %argmem)
diff --git a/clang/test/CodeGenCXX/inalloca-vector.cpp b/clang/test/CodeGenCXX/inalloca-vector.cpp
new file
--- /dev/null
+++ b/clang/test/CodeGenCXX/inalloca-vector.cpp
@@ -0,0 +1,79 @@
+// RUN: %clang_cc1 -w -triple i686-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes up to three vectors in registers, and the rest indirectly. Check
+// that both are compatible with an inalloca prototype.
+
+struct NonTrivial {
+  NonTrivial();
+  NonTrivial(const NonTrivial &o);
+  unsigned handle;
+};
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+__m128 gv128;
+
+// nt, w, and q will be in the inalloca pack.
+void receive_vec_128(NonTrivial nt, __m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local void @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@1111@Z"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %0)
+
+void pass_vec_128() {
+  __m128 z = {0};
+  receive_vec_128(NonTrivial(), z, z, z, z, z);
+}
+// CHECK-LABEL: define dso_local void @"?pass_vec_128@@YAXXZ"()
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 0
+// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* %{{.*}})
+
+// Store q, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 1
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// Store w, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 2
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// CHECK: call void @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@1111@Z"
+// CHECK-SAME: (<4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %{{[^,]*}})
+
+// w will be passed indirectly by register, and q will be passed indirectly, but
+// the pointer will be in memory.
+void __fastcall fastcall_receive_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q, NonTrivial nt) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local x86_fastcallcc void @"?fastcall_receive_vec@@Y{{[^"]*}}"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <4 x float>* inreg %0,
+// CHECK-SAME: i32 inreg %edx,
+// CHECK-SAME: <{ <4 x float>*, %struct.NonTrivial }>* inalloca %1)
+
+
+void __vectorcall vectorcall_receive_vec(double xmm0, double xmm1, double xmm2,
+                                         __m128 x, __m128 y, __m128 z,
+                                         __m128 w, int edx, __m128 q, NonTrivial nt) {
+  gv128 = x + y + z + w + q;
+}
+// FIXME: Enable these checks; clang generates wrong IR for this case.
+// CHECK-LABEL: define dso_local x86_vectorcallcc void @"?vectorcall_receive_vec@@Y{{[^"]*}}"
+// CHECKX-SAME: (double inreg %xmm0,
+// CHECKX-SAME: double inreg %xmm1,
+// CHECKX-SAME: double inreg %xmm2,
+// CHECKX-SAME: <4 x float> inreg %x,
+// CHECKX-SAME: <4 x float> inreg %y,
+// CHECKX-SAME: <4 x float> inreg %z,
+// CHECKX-SAME: <4 x float>* inreg %0,
+// CHECKX-SAME: i32 inreg %edx,
+// CHECKX-SAME: <{ <4 x float>*, %struct.NonTrivial }>* inalloca %1)
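
The tests above are the authoritative examples. As a quick reference, the sketch below (not part of the patch; names are illustrative only, and it assumes -fms-extensions with an i686-pc-windows-msvc target like the tests) summarizes the two argument-passing rules the change implements:

// Illustrative sketch only, not part of the patch. Assumes -fms-extensions and
// an i686-pc-windows-msvc target, matching the tests in this change.
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

struct __declspec(align(64)) OverAligned { int buf[16]; };
struct Plain { int x; };

// 'p' is passed by value in the 4-byte-aligned argument memory area, as before.
// 'o' has a required alignment greater than 4 bytes, so the caller builds a
// correctly aligned temporary and passes only its address. If the same call
// also has a non-trivial argument, that pointer becomes a field of the
// inalloca pack rather than a separate byval slot.
void takes_overaligned(Plain p, OverAligned o);

// The first three SSE vector arguments (x, y, z) are passed directly in SSE
// registers (direct, inreg); 'w' gets no register and is passed as a pointer
// to a caller-created temporary.
void takes_vectors(__m128 x, __m128 y, __m128 z, __m128 w);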