diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -61,13 +61,19 @@
 * The ``inalloca`` attribute now has a mandatory type field, similar to
   ``byval`` and ``sret``.
 
-
 Changes to building LLVM
 ------------------------
 
 Changes to TableGen
 -------------------
 
+Changes to Backend Code Generation
+----------------------------------
+
+* When lowering calls, only ABI attributes on the call itself are checked, not
+  the caller. Frontends need to make sure to properly set ABI attributes on
+  calls (and always should have).
+
 Changes to the ARM Backend
 --------------------------
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -102,28 +102,31 @@
   return true;
 }
 
-/// Set CallLoweringInfo attribute flags based on a call instruction
-/// and called function attributes.
+/// Set CallLoweringInfo attribute flags based on the call instruction's
+/// argument attributes.
 void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call,
                                                      unsigned ArgIdx) {
-  IsSExt = Call->paramHasAttr(ArgIdx, Attribute::SExt);
-  IsZExt = Call->paramHasAttr(ArgIdx, Attribute::ZExt);
-  IsInReg = Call->paramHasAttr(ArgIdx, Attribute::InReg);
-  IsSRet = Call->paramHasAttr(ArgIdx, Attribute::StructRet);
-  IsNest = Call->paramHasAttr(ArgIdx, Attribute::Nest);
-  IsByVal = Call->paramHasAttr(ArgIdx, Attribute::ByVal);
-  IsPreallocated = Call->paramHasAttr(ArgIdx, Attribute::Preallocated);
-  IsInAlloca = Call->paramHasAttr(ArgIdx, Attribute::InAlloca);
-  IsReturned = Call->paramHasAttr(ArgIdx, Attribute::Returned);
-  IsSwiftSelf = Call->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
-  IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError);
-  Alignment = Call->getParamStackAlign(ArgIdx);
+  auto Attrs = Call->getAttributes();
+
+  IsSExt = Attrs.hasParamAttribute(ArgIdx, Attribute::SExt);
+  IsZExt = Attrs.hasParamAttribute(ArgIdx, Attribute::ZExt);
+  IsInReg = Attrs.hasParamAttribute(ArgIdx, Attribute::InReg);
+  IsSRet = Attrs.hasParamAttribute(ArgIdx, Attribute::StructRet);
+  IsNest = Attrs.hasParamAttribute(ArgIdx, Attribute::Nest);
+  IsReturned = Attrs.hasParamAttribute(ArgIdx, Attribute::Returned);
+  IsSwiftSelf = Attrs.hasParamAttribute(ArgIdx, Attribute::SwiftSelf);
+  IsSwiftError = Attrs.hasParamAttribute(ArgIdx, Attribute::SwiftError);
+  Alignment = Attrs.getParamStackAlignment(ArgIdx);
+
+  IsByVal = Attrs.hasParamAttribute(ArgIdx, Attribute::ByVal);
   ByValType = nullptr;
   if (IsByVal) {
     ByValType = Call->getParamByValType(ArgIdx);
     if (!Alignment)
       Alignment = Call->getParamAlign(ArgIdx);
   }
+  IsInAlloca = Attrs.hasParamAttribute(ArgIdx, Attribute::InAlloca);
+  IsPreallocated = Attrs.hasParamAttribute(ArgIdx, Attribute::Preallocated);
   PreallocatedType = nullptr;
   if (IsPreallocated)
     PreallocatedType = Call->getParamPreallocatedType(ArgIdx);
diff --git a/llvm/test/CodeGen/AArch64/arm64-this-return.ll b/llvm/test/CodeGen/AArch64/arm64-this-return.ll
--- a/llvm/test/CodeGen/AArch64/arm64-this-return.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-this-return.ll
@@ -38,9 +38,9 @@
 ; CHECK-NOT: mov x0, {{x[0-9]+}}
 ; CHECK: b {{_?B_ctor_base}}
   %0 = bitcast %struct.C* %this to %struct.A*
-  %call = tail call %struct.A* @A_ctor_base(%struct.A* %0)
+  %call = tail call %struct.A* @A_ctor_base(%struct.A* returned %0)
   %1 = getelementptr inbounds %struct.C, %struct.C* %this, i32 0, i32 0
-  %call2 = tail call %struct.B* @B_ctor_base(%struct.B* %1, i32 %x)
+  %call2 = tail call %struct.B* @B_ctor_base(%struct.B* returned %1, i32 %x)
   ret %struct.C* %this
 }
 
@@ -88,7 +88,7 @@
 entry:
 ; CHECK-LABEL: C_ctor_complete:
 ; CHECK: b {{_?C_ctor_base}}
-  %call = tail call %struct.C* @C_ctor_base(%struct.C* %this, i32 %x)
+  %call = tail call %struct.C* @C_ctor_base(%struct.C* returned %this, i32 %x)
   ret %struct.C* %this
 }
 
@@ -135,8 +135,8 @@
 ; CHECK-NOT: mov x0, {{x[0-9]+}}
 ; CHECK: b {{_?B_ctor_complete}}
   %b = getelementptr inbounds %struct.D, %struct.D* %this, i32 0, i32 0
-  %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
-  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+  %call = tail call %struct.B* @B_ctor_complete(%struct.B* returned %b, i32 %x)
+  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* returned %b, i32 %x)
   ret %struct.D* %this
 }
 
@@ -166,8 +166,8 @@
 ; CHECK-LABEL: E_ctor_base:
 ; CHECK-NOT: b {{_?B_ctor_complete}}
   %b = getelementptr inbounds %struct.E, %struct.E* %this, i32 0, i32 0
-  %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+  %call = tail call %struct.B* @B_ctor_complete(%struct.B* returned %b, i32 %x)
   %b2 = getelementptr inbounds %struct.E, %struct.E* %this, i32 0, i32 1
-  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b2, i32 %x)
+  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* returned %b2, i32 %x)
   ret %struct.E* %this
 }
diff --git a/llvm/test/CodeGen/AArch64/bitfield-extract.ll b/llvm/test/CodeGen/AArch64/bitfield-extract.ll
--- a/llvm/test/CodeGen/AArch64/bitfield-extract.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-extract.ll
@@ -91,7 +91,7 @@
 define void @test11(i64 %a) {
   %tmp = lshr i64 %a, 23
   %res = trunc i64 %tmp to i16
-  call void @use(i16 %res, i64 %tmp)
+  call void @use(i16 signext %res, i64 %tmp)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AArch64/tailcall-explicit-sret.ll b/llvm/test/CodeGen/AArch64/tailcall-explicit-sret.ll
--- a/llvm/test/CodeGen/AArch64/tailcall-explicit-sret.ll
+++ b/llvm/test/CodeGen/AArch64/tailcall-explicit-sret.ll
@@ -11,7 +11,7 @@
 ; CHECK-LABEL: _test_tailcall_explicit_sret:
 ; CHECK-NEXT: b _test_explicit_sret
 define void @test_tailcall_explicit_sret(i1024* sret(i1024) %arg) #0 {
-  tail call void @test_explicit_sret(i1024* %arg)
+  tail call void @test_explicit_sret(i1024* sret(i1024) %arg)
   ret void
 }
 
@@ -20,7 +20,7 @@
 ; CHECK: bl _test_explicit_sret
 ; CHECK: ret
 define void @test_call_explicit_sret(i1024* sret(i1024) %arg) #0 {
-  call void @test_explicit_sret(i1024* %arg)
+  call void @test_explicit_sret(i1024* sret(i1024) %arg)
   ret void
 }
 
@@ -30,7 +30,7 @@
 ; CHECK: ret
 define void @test_tailcall_explicit_sret_alloca_unused() #0 {
   %l = alloca i1024, align 8
-  tail call void @test_explicit_sret(i1024* %l)
+  tail call void @test_explicit_sret(i1024* sret(i1024) %l)
   ret void
 }
 
@@ -44,7 +44,7 @@
   %l = alloca i1024, align 8
   %r = load i1024, i1024* %ptr, align 8
   store i1024 %r, i1024* %l, align 8
-  tail call void @test_explicit_sret(i1024* %l)
+  tail call void @test_explicit_sret(i1024* sret(i1024) %l)
   ret void
 }
 
@@ -56,7 +56,7 @@
 ; CHECK: ret
 define void @test_tailcall_explicit_sret_gep(i1024* %ptr) #0 {
   %ptr2 = getelementptr i1024, i1024* %ptr, i32 1
-  tail call void @test_explicit_sret(i1024* %ptr2)
+  tail call void @test_explicit_sret(i1024* sret(i1024) %ptr2)
   ret void
 }
 
@@ -69,7 +69,7 @@
 ; CHECK: ret
 define i1024 @test_tailcall_explicit_sret_alloca_returned() #0 {
   %l = alloca i1024, align 8
-  tail call void @test_explicit_sret(i1024* %l)
+  tail call void @test_explicit_sret(i1024* sret(i1024) %l)
   %r = load i1024, i1024* %l, align 8
   ret i1024 %r
 }
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -92,7 +92,7 @@
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
   %var = load volatile i1, i1 addrspace(1)* undef
-  call void @external_void_func_i1_signext(i1 %var)
+  call void @external_void_func_i1_signext(i1 signext %var)
   ret void
 }
 
@@ -113,7 +113,7 @@
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
   %var = load volatile i1, i1 addrspace(1)* undef
-  call void @external_void_func_i1_zeroext(i1 %var)
+  call void @external_void_func_i1_zeroext(i1 zeroext %var)
   ret void
 }
 
@@ -148,7 +148,7 @@
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
   %var = load volatile i8, i8 addrspace(1)* undef
-  call void @external_void_func_i8_signext(i8 %var)
+  call void @external_void_func_i8_signext(i8 signext %var)
   ret void
 }
 
@@ -166,7 +166,7 @@
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
   %var = load volatile i8, i8 addrspace(1)* undef
-  call void @external_void_func_i8_zeroext(i8 %var)
+  call void @external_void_func_i8_zeroext(i8 zeroext %var)
   ret void
 }
 
@@ -195,7 +195,7 @@
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
   %var = load volatile i16, i16 addrspace(1)* undef
-  call void @external_void_func_i16_signext(i16 %var)
+  call void @external_void_func_i16_signext(i16 signext %var)
   ret void
 }
 
@@ -212,7 +212,7 @@
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
   %var = load volatile i16, i16 addrspace(1)* undef
-  call void @external_void_func_i16_zeroext(i16 %var)
+  call void @external_void_func_i16_zeroext(i16 zeroext %var)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -517,7 +517,7 @@
     i32 210, i32 220, i32 230, i32 240,
     i32 250, i32 260, i32 270, i32 280,
     i32 290, i32 300, i32 310, i32 320,
-    i32 addrspace(5)* %alloca)
+    i32 addrspace(5)* byval(i32) %alloca)
   ret void
 }
 
@@ -541,7 +541,7 @@
     i32 210, i32 220, i32 230, i32 240,
     i32 250, i32 260, i32 270, i32 280,
     i32 290, i32 300, i32 310, i32 320,
-    i32 addrspace(5)* %alloca)
+    i32 addrspace(5)* byval(i32) %alloca)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -649,7 +649,7 @@
     i32 210, i32 220, i32 230, i32 240,
     i32 250, i32 260, i32 270, i32 280,
     i32 290, i32 300, i32 310, i32 320,
-    i32 addrspace(5)* %alloca)
+    i32 addrspace(5)* byval(i32) %alloca)
   ret void
 }
 
@@ -686,7 +686,7 @@
     i32 210, i32 220, i32 230, i32 240,
     i32 250, i32 260, i32 270, i32 280,
     i32 290, i32 300, i32 310, i32 320,
-    i32 addrspace(5)* %alloca)
+    i32 addrspace(5)* byval(i32) %alloca)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -214,7 +214,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %var = load volatile i1, i1 addrspace(1)* undef
-  call amdgpu_gfx void @external_void_func_i1_signext(i1 %var)
+  call amdgpu_gfx void @external_void_func_i1_signext(i1 signext %var)
   ret void
 }
 
@@ -280,7 +280,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %var = load volatile i1, i1 addrspace(1)* undef
-  call amdgpu_gfx void @external_void_func_i1_zeroext(i1 %var)
+  call amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext %var)
   ret void
 }
 
@@ -401,7 +401,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %var = load volatile i8, i8 addrspace(1)* undef
-  call amdgpu_gfx void @external_void_func_i8_signext(i8 %var)
+  call amdgpu_gfx void @external_void_func_i8_signext(i8 signext %var)
   ret void
 }
 
@@ -463,7 +463,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %var = load volatile i8, i8 addrspace(1)* undef
-  call amdgpu_gfx void @external_void_func_i8_zeroext(i8 %var)
+  call amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext %var)
   ret void
 }
 
@@ -584,7 +584,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %var = load volatile i16, i16 addrspace(1)* undef
-  call amdgpu_gfx void @external_void_func_i16_signext(i16 %var)
+  call amdgpu_gfx void @external_void_func_i16_signext(i16 signext %var)
   ret void
 }
 
@@ -646,7 +646,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %var = load volatile i16, i16 addrspace(1)* undef
-  call amdgpu_gfx void @external_void_func_i16_zeroext(i16 %var)
+  call amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext %var)
   ret void
 }
 
@@ -3081,7 +3081,7 @@
   %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1
   store i8 3, i8 addrspace(5)* %gep0
   store i32 8, i32 addrspace(5)* %gep1
-  call amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %val)
+  call amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %val)
   ret void
 }
 
@@ -3173,7 +3173,7 @@
   %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1
   store i8 3, i8 addrspace(5)* %in.gep0
   store i32 8, i32 addrspace(5)* %in.gep1
-  call amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val)
+  call amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret({ i8, i32 }) %out.val, { i8, i32 } addrspace(5)* byval({ i8, i32 }) %in.val)
   %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0
   %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1
   %out.val0 = load i8, i8 addrspace(5)* %out.gep0
@@ -3383,7 +3383,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_i1_inreg(i1 true)
+  call amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg true)
   ret void
 }
 
@@ -3442,7 +3442,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_i8_inreg(i8 123)
+  call amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg 123)
   ret void
 }
 
@@ -3501,7 +3501,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_i16_inreg(i16 123)
+  call amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg 123)
   ret void
 }
 
@@ -3560,7 +3560,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_i32_inreg(i32 42)
+  call amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg 42)
   ret void
 }
 
@@ -3621,7 +3621,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_i64_inreg(i64 123)
+  call amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg 123)
   ret void
 }
 
@@ -3683,7 +3683,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %val = load <2 x i64>, <2 x i64> addrspace(4)* null
-  call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> %val)
+  call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg %val)
   ret void
 }
 
@@ -3748,7 +3748,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> )
+  call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg )
   ret void
 }
 
@@ -3816,7 +3816,7 @@
   %load = load <2 x i64>, <2 x i64> addrspace(4)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> 
-  call amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> %val)
+  call amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> inreg %val)
   ret void
 }
 
@@ -3887,7 +3887,7 @@
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %load = load <2 x i64>, <2 x i64> addrspace(4)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> 
-  call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> %val)
+  call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg %val)
   ret void
 }
 
@@ -3946,7 +3946,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_f16_inreg(half 4.0)
+  call amdgpu_gfx void @external_void_func_f16_inreg(half inreg 4.0)
   ret void
 }
 
@@ -4005,7 +4005,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_f32_inreg(float 4.0)
+  call amdgpu_gfx void @external_void_func_f32_inreg(float inreg 4.0)
   ret void
 }
 
@@ -4066,7 +4066,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> )
+  call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg )
   ret void
 }
 
@@ -4129,7 +4129,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> )
+  call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg )
   ret void
 }
 
@@ -4196,7 +4196,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> )
+  call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg )
   ret void
 }
 
@@ -4257,7 +4257,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_f64_inreg(double 4.0)
+  call amdgpu_gfx void @external_void_func_f64_inreg(double inreg 4.0)
   ret void
 }
 
@@ -4322,7 +4322,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> )
+  call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg )
   ret void
 }
 
@@ -4391,7 +4391,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> )
+  call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg )
   ret void
 }
 
@@ -4451,7 +4451,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %val = load <2 x i16>, <2 x i16> addrspace(4)* undef
-  call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> %val)
+  call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg %val)
   ret void
 }
 
@@ -4511,7 +4511,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %val = load <3 x i16>, <3 x i16> addrspace(4)* undef
-  call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> %val)
+  call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg %val)
   ret void
 }
 
@@ -4571,7 +4571,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %val = load <3 x half>, <3 x half> addrspace(4)* undef
-  call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> %val)
+  call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg %val)
   ret void
 }
 
@@ -4632,7 +4632,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> )
+  call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg )
   ret void
 }
 
@@ -4693,7 +4693,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> )
+  call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg )
   ret void
 }
 
@@ -4753,7 +4753,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %val = load <4 x i16>, <4 x i16> addrspace(4)* undef
-  call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> %val)
+  call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg %val)
   ret void
 }
 
@@ -4814,7 +4814,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> )
+  call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg )
   ret void
 }
 
@@ -4874,7 +4874,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %val = load <2 x half>, <2 x half> addrspace(4)* undef
-  call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> %val)
+  call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg %val)
   ret void
 }
 
@@ -4934,7 +4934,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %val = load <2 x i32>, <2 x i32> addrspace(4)* undef
-  call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> %val)
+  call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg %val)
   ret void
 }
 
@@ -4995,7 +4995,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> )
+  call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg )
   ret void
 }
 
@@ -5058,7 +5058,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> )
+  call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg )
   ret void
 }
 
@@ -5123,7 +5123,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> , i32 6)
+  call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg , i32 inreg 6)
   ret void
 }
 
@@ -5183,7 +5183,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %val = load <4 x i32>, <4 x i32> addrspace(4)* undef
-  call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> %val)
+  call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg %val)
   ret void
 }
 
@@ -5248,7 +5248,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> )
+  call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg )
   ret void
 }
 
@@ -5315,7 +5315,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> )
+  call amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg )
   ret void
 }
 
@@ -5380,7 +5380,7 @@
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %ptr = load <8 x i32> addrspace(4)*, <8 x i32> addrspace(4)* addrspace(4)* undef
   %val = load <8 x i32>, <8 x i32> addrspace(4)* %ptr
-  call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> %val)
+  call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg %val)
   ret void
 }
 
@@ -5453,7 +5453,7 @@
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
-  call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> )
+  call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg )
   ret void
 }
 
@@ -5518,7 +5518,7 @@
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %ptr = load <16 x i32> addrspace(4)*, <16 x i32> addrspace(4)* addrspace(4)* undef
   %val = load <16 x i32>, <16 x i32> addrspace(4)* %ptr
-  call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> %val)
+  call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg %val)
   ret void
 }
 
@@ -5696,7 +5696,7 @@
 ; GFX10-NEXT: s_setpc_b64 s[4:5]
   %ptr = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef
   %val = load <32 x i32>, <32 x i32> addrspace(4)* %ptr
-  call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> %val)
+  call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg %val)
   ret void
 }
 
@@ -5882,7 +5882,7 @@
   %ptr0 = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef
   %val0 = load <32 x i32>, <32 x i32> addrspace(4)* %ptr0
   %val1 = load i32, i32 addrspace(4)* undef
-  call amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> %val0, i32 %val1)
+  call amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> inreg %val0, i32 inreg %val1)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
--- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
@@ -23,6 +23,6 @@
 ; GCN-NEXT: s_addc_u32 s7, s7, callee@rel32@hi+12
 ; GCN-NEXT: s_setpc_b64 s[6:7]
   %add = fadd float %arg0, 1.0
-  %call = tail call amdgpu_gfx float @callee(float %add, float 2.0)
+  %call = tail call amdgpu_gfx float @callee(float %add, float inreg 2.0)
   ret float %call
 }
diff --git a/llvm/test/CodeGen/ARM/ipra-r0-returned.ll b/llvm/test/CodeGen/ARM/ipra-r0-returned.ll
--- a/llvm/test/CodeGen/ARM/ipra-r0-returned.ll
+++ b/llvm/test/CodeGen/ARM/ipra-r0-returned.ll
@@ -13,6 +13,6 @@
 ; CHECK-NOT: r0
 ; CHECK: bl returns_r0
 ; CHECK-NOT: r0
-  %b = call i32 @returns_r0(i32 %a)
+  %b = call i32 @returns_r0(i32 returned %a)
   ret i32 %a
 }
diff --git a/llvm/test/CodeGen/ARM/returned-ext.ll b/llvm/test/CodeGen/ARM/returned-ext.ll
--- a/llvm/test/CodeGen/ARM/returned-ext.ll
+++ b/llvm/test/CodeGen/ARM/returned-ext.ll
@@ -22,9 +22,9 @@
 ; CHECKT2D: uxth r0, r0
 ; CHECKT2D: bl _identity32
 ; CHECKT2D: mov r0, [[SAVEX]]
-  %call = tail call i16 @identity16(i16 %x)
+  %call = tail call i16 @identity16(i16 returned %x)
   %b = zext i16 %call to i32
-  %call2 = tail call i32 @identity32(i32 %b)
+  %call2 = tail call i32 @identity32(i32 returned %b)
   ret i16 %x
 }
 
@@ -56,9 +56,9 @@
 ; This shouldn't be required
 ; CHECKT2D: mov r0, [[SAVEX]]
 
-  %call = tail call i16 @retzext16(i16 %x)
+  %call = tail call i16 @retzext16(i16 returned %x)
   %b = zext i16 %call to i32
-  %call2 = tail call i32 @identity32(i32 %b)
+  %call2 = tail call i32 @identity32(i32 returned %b)
   ret i16 %x
 }
 
@@ -76,9 +76,9 @@
 ; CHECKT2D: sxth r0, {{r[0-9]+}}
 ; CHECKT2D: bl _identity32
 ; CHECKT2D: mov r0, [[SAVEX]]
-  %call = tail call i16 @retzext16(i16 %x)
+  %call = tail call i16 @retzext16(i16 returned %x)
   %b = sext i16 %call to i32
-  %call2 = tail call i32 @identity32(i32 %b)
+  %call2 = tail call i32 @identity32(i32 returned %b)
   ret i16 %x
 }
 
@@ -96,10 +96,10 @@
 ; CHECKT2D: uxth r0, r0
 ; CHECKT2D: bl _identity32
 ; CHECKT2D: b.w _paramzext16
-  %call = tail call i16 @paramzext16(i16 %x)
+  %call = tail call i16 @paramzext16(i16 zeroext returned %x)
   %b = zext i16 %call to i32
-  %call2 = tail call i32 @identity32(i32 %b)
-  %call3 = tail call i16 @paramzext16(i16 %call)
+  %call2 = tail call i32 @identity32(i32 returned %b)
+  %call3 = tail call i16 @paramzext16(i16 zeroext returned %call)
   ret i16 %call3
 }
 
@@ -121,13 +121,13 @@
 ; CHECKT2D: bl _paramzext16
 ; CHECKT2D: bl _identity32
 ; CHECKT2D: b.w _paramzext16
-  %call = tail call i16 @paramzext16(i16 %x)
+  %call = tail call i16 @paramzext16(i16 zeroext returned %x)
 
 ; Should make no difference if %x is used below rather than %call, but it does
 
   %b = zext i16 %x to i32
   %call2 = tail call i32 @identity32(i32 %b)
 
-  %call3 = tail call i16 @paramzext16(i16 %call)
+  %call3 = tail call i16 @paramzext16(i16 zeroext returned %call)
   ret i16 %call3
 }
 
@@ -149,9 +149,9 @@
 ; FIXME: Tail call should be OK here
 ; CHECKT2D: bl _identity32
 
-  %call = tail call i16 @bothzext16(i16 %x)
+  %call = tail call i16 @bothzext16(i16 zeroext returned %x)
   %b = zext i16 %x to i32
-  %call2 = tail call i32 @identity32(i32 %b)
+  %call2 = tail call i32 @identity32(i32 returned %b)
   ret i16 %call
 }
 
@@ -171,8 +171,8 @@
 ; CHECKT2D: sxth r0, [[SAVEX]]
 ; CHECKT2D: bl _identity32
 ; CHECKT2D: mov r0, [[SAVEX]]
-  %call = tail call i16 @bothzext16(i16 %x)
+  %call = tail call i16 @bothzext16(i16 zeroext returned %x)
   %b = sext i16 %x to i32
-  %call2 = tail call i32 @identity32(i32 %b)
+  %call2 = tail call i32 @identity32(i32 returned %b)
   ret i16 %x
 }
diff --git a/llvm/test/CodeGen/ARM/this-return.ll b/llvm/test/CodeGen/ARM/this-return.ll
--- a/llvm/test/CodeGen/ARM/this-return.ll
+++ b/llvm/test/CodeGen/ARM/this-return.ll
@@ -28,9 +28,9 @@
 ; CHECKT2D-NOT: mov r0, {{r[0-9]+}}
 ; CHECKT2D: b.w _B_ctor_base
   %0 = bitcast %struct.C* %this to %struct.A*
-  %call = tail call %struct.A* @A_ctor_base(%struct.A* %0)
+  %call = tail call %struct.A* @A_ctor_base(%struct.A* returned %0)
   %1 = getelementptr inbounds %struct.C, %struct.C* %this, i32 0, i32 0
-  %call2 = tail call %struct.B* @B_ctor_base(%struct.B* %1, i32 %x)
+  %call2 = tail call %struct.B* @B_ctor_base(%struct.B* returned %1, i32 %x)
   ret %struct.C* %this
 }
 
@@ -59,7 +59,7 @@
 ; CHECKELF: b C_ctor_base
 ; CHECKT2D-LABEL: C_ctor_complete:
 ; CHECKT2D: b.w _C_ctor_base
-  %call = tail call %struct.C* @C_ctor_base(%struct.C* %this, i32 %x)
+  %call = tail call %struct.C* @C_ctor_base(%struct.C* returned %this, i32 %x)
   ret %struct.C* %this
 }
 
@@ -86,8 +86,8 @@
 ; CHECKT2D-NOT: mov r0, {{r[0-9]+}}
 ; CHECKT2D: b.w _B_ctor_complete
   %b = getelementptr inbounds %struct.D, %struct.D* %this, i32 0, i32 0
-  %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
-  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+  %call = tail call %struct.B* @B_ctor_complete(%struct.B* returned %b, i32 %x)
+  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* returned %b, i32 %x)
   ret %struct.D* %this
 }
 
@@ -98,8 +98,8 @@
 ; CHECKT2D-LABEL: E_ctor_base:
 ; CHECKT2D-NOT: b.w _B_ctor_complete
   %b = getelementptr inbounds %struct.E, %struct.E* %this, i32 0, i32 0
-  %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+  %call = tail call %struct.B* @B_ctor_complete(%struct.B* returned %b, i32 %x)
   %b2 = getelementptr inbounds %struct.E, %struct.E* %this, i32 0, i32 1
-  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b2, i32 %x)
+  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* returned %b2, i32 %x)
   ret %struct.E* %this
 }
diff --git a/llvm/test/CodeGen/SPARC/64abi.ll b/llvm/test/CodeGen/SPARC/64abi.ll
--- a/llvm/test/CodeGen/SPARC/64abi.ll
+++ b/llvm/test/CodeGen/SPARC/64abi.ll
@@ -50,7 +50,7 @@
 ; CHECK-NOT: add %sp
 ; CHECK: restore
 define void @call_intarg(i32 %i0, i8* %i1) {
-  call void @intarg(i8 0, i8 1, i16 2, i32 3, i8* undef, i32 5, i32 %i0, i8* %i1)
+  call void @intarg(i8 0, i8 1, i16 2, i32 3, i8* undef, i32 5, i32 signext %i0, i8* %i1)
   ret void
 }
 
@@ -222,7 +222,7 @@
 ; SOFT: or %i1, %i0, %o0
 ; CHECK: call inreg_fi
 define void @call_inreg_fi(i32* %p, i32 %i1, float %f5) {
-  %x = call i32 @inreg_fi(i32 %i1, float %f5)
+  %x = call i32 @inreg_fi(i32 inreg %i1, float inreg %f5)
   ret void
 }
 
@@ -245,7 +245,7 @@
 ; SOFT: or %i1, %i0, %o0
 ; CHECK: call inreg_ff
 define void @call_inreg_ff(i32* %p, float %f3, float %f5) {
-  %x = call float @inreg_ff(float %f3, float %f5)
+  %x = call float @inreg_ff(float inreg %f3, float inreg %f5)
   ret void
 }
 
@@ -269,7 +269,7 @@
 ; SOFT: or %i1, %i0, %o0
 ; CHECK: call inreg_if
 define void @call_inreg_if(i32* %p, float %f3, i32 %i2) {
-  %x = call i32 @inreg_if(float %f3, i32 %i2)
+  %x = call i32 @inreg_if(float inreg %f3, i32 inreg %i2)
   ret void
 }
 
@@ -289,7 +289,7 @@
 ; CHECK: or [[R1]], [[R2]], %o0
 ; CHECK: call inreg_ii
 define void @call_inreg_ii(i32* %p, i32 %i1, i32 %i2) {
-  %x = call i32 @inreg_ii(i32 %i1, i32 %i2)
+  %x = call i32 @inreg_ii(i32 inreg %i1, i32 inreg %i2)
   ret void
 }
diff --git a/llvm/test/CodeGen/SystemZ/args-02.ll b/llvm/test/CodeGen/SystemZ/args-02.ll
--- a/llvm/test/CodeGen/SystemZ/args-02.ll
+++ b/llvm/test/CodeGen/SystemZ/args-02.ll
@@ -66,9 +66,9 @@
 ; CHECK-STACK: mvghi 160(%r15), -5
 ; CHECK-STACK: brasl %r14, bar@PLT
 
-  call void @bar (i8 -1, i16 -2, i32 -3, i64 -4, float 0.0, double 0.0,
+  call void @bar (i8 signext -1, i16 signext -2, i32 signext -3, i64 -4, float 0.0, double 0.0,
                   fp128 0xL00000000000000000000000000000000, i64 -5,
-                  float -0.0, double -0.0, i8 -6, i16 -7, i32 -8, i64 -9,
+                  float -0.0, double -0.0, i8 signext -6, i16 signext -7, i32 signext -8, i64 -9,
                   float 0.0, double 0.0,
                   fp128 0xL00000000000000000000000000000000)
   ret void
diff --git a/llvm/test/CodeGen/SystemZ/args-03.ll b/llvm/test/CodeGen/SystemZ/args-03.ll
--- a/llvm/test/CodeGen/SystemZ/args-03.ll
+++ b/llvm/test/CodeGen/SystemZ/args-03.ll
@@ -68,9 +68,9 @@
 ; CHECK-STACK: mvghi 160(%r15), -5
 ; CHECK-STACK: brasl %r14, bar@PLT
 
-  call void @bar (i8 -1, i16 -2, i32 -3, i64 -4, float 0.0, double 0.0,
+  call void @bar (i8 zeroext -1, i16 zeroext -2, i32 zeroext -3, i64 -4, float 0.0, double 0.0,
                   fp128 0xL00000000000000000000000000000000, i64 -5,
-                  float -0.0, double -0.0, i8 -6, i16 -7, i32 -8, i64 -9,
+                  float -0.0, double -0.0, i8 zeroext -6, i16 zeroext -7, i32 zeroext -8, i64 -9,
                   float 0.0, double 0.0,
                   fp128 0xL00000000000000000000000000000000)
   ret void
diff --git a/llvm/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/llvm/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
--- a/llvm/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
+++ b/llvm/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
@@ -7,7 +7,7 @@
 
 define x86_fastcallcc void @caller(i32, i64) {
   %X = alloca i32 ; <i32*> [#uses=1]
-  call x86_fastcallcc void @func( i32* %X, i64 0 )
+  call x86_fastcallcc void @func( i32* %X, i64 inreg 0 )
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/fast-cc-pass-in-regs.ll b/llvm/test/CodeGen/X86/fast-cc-pass-in-regs.ll
--- a/llvm/test/CodeGen/X86/fast-cc-pass-in-regs.ll
+++ b/llvm/test/CodeGen/X86/fast-cc-pass-in-regs.ll
@@ -4,7 +4,7 @@
 declare x86_fastcallcc i64 @callee(i64 inreg)
 
 define i64 @caller() {
-  %X = call x86_fastcallcc i64 @callee( i64 4294967299 ) ; <i64> [#uses=1]
+  %X = call x86_fastcallcc i64 @callee( i64 inreg 4294967299 ) ; <i64> [#uses=1]
 ; CHECK: mov{{.*}}edx, 1
   ret i64 %X
 }
diff --git a/llvm/test/CodeGen/X86/mismatched-byval.ll b/llvm/test/CodeGen/X86/mismatched-byval.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mismatched-byval.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
+
+; This tests that we only look at the call site for ABI attributes, so f and f2 should codegen differently
+
+define void @b(i8* byval(i8) %p) {
+; CHECK-LABEL: b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  ret void
+}
+
+define void @f(i8 %p) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    movb %al, (%rsp)
+; CHECK-NEXT:    callq b@PLT
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %a = alloca i8
+  ;store i8 %p, i8* %a
+  call void @b(i8* byval(i8) %a)
+  ret void
+}
+
+define void @f2(i8 %p) {
+; CHECK-LABEL: f2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    callq b@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %a = alloca i8
+  ;store i8 %p, i8* %a
+  call void @b(i8* %a)
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/X86/movtopush.ll b/llvm/test/CodeGen/X86/movtopush.ll
--- a/llvm/test/CodeGen/X86/movtopush.ll
+++ b/llvm/test/CodeGen/X86/movtopush.ll
@@ -107,7 +107,7 @@
 ; NORMAL-NEXT: addl $12, %esp
 define void @test4() optsize {
 entry:
-  call void @inreg(i32 1, i32 2, i32 3, i32 4)
+  call void @inreg(i32 1, i32 inreg 2, i32 3, i32 4)
   ret void
 }
 
@@ -307,9 +307,9 @@
 define void @test12() optsize {
 entry:
   %s = alloca %struct.s, align 4
-  call void @struct(%struct.s* %s, i32 2, i32 3, i32 4)
+  call void @struct(%struct.s* byval(%struct.s) %s, i32 2, i32 3, i32 4)
   call void @good(i32 5, i32 6, i32 7, i32 8)
-  call void @struct(%struct.s* %s, i32 10, i32 11, i32 12)
+  call void @struct(%struct.s* byval(%struct.s) %s, i32 10, i32 11, i32 12)
   ret void
 }
 
@@ -340,7 +340,7 @@
 entry:
   %s = alloca %struct.s, align 4
   call void @good(i32 1, i32 2, i32 3, i32 4)
-  call void @struct(%struct.s* %s, i32 6, i32 7, i32 8)
+  call void @struct(%struct.s* byval(%struct.s) %s, i32 6, i32 7, i32 8)
   call void @good(i32 9, i32 10, i32 11, i32 12)
   ret void
 }
 
@@ -413,7 +413,7 @@
   %0 = bitcast %struct.A* %a to i64*
   %1 = load i64, i64* %0, align 4
   store i64 %1, i64* %agg.tmp, align 4
-  %call = call x86_thiscallcc %struct.B* @B_ctor(%struct.B* %ref.tmp, %struct.A* byval(%struct.A) %tmpcast)
+  %call = call x86_thiscallcc %struct.B* @B_ctor(%struct.B* returned %ref.tmp, %struct.A* byval(%struct.A) %tmpcast)
   %2 = getelementptr inbounds %struct.B, %struct.B* %tmp, i32 0, i32 0
   call void @B_func(%struct.B* sret(%struct.B) %tmp, %struct.B* %ref.tmp, i32 1)
   ret void
diff --git a/llvm/test/CodeGen/X86/pop-stack-cleanup.ll b/llvm/test/CodeGen/X86/pop-stack-cleanup.ll
--- a/llvm/test/CodeGen/X86/pop-stack-cleanup.ll
+++ b/llvm/test/CodeGen/X86/pop-stack-cleanup.ll
@@ -60,7 +60,7 @@
 ; CHECK-DAG: movl {{.*}}, %edx
 ; CHECK: calll _spill
   %i = call i32 @param2_ret(i32 1, i32 2)
-  call void @spill(i32 %a, i32 %b, i32 %c)
+  call void @spill(i32 inreg %a, i32 inreg %b, i32 inreg %c)
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/preallocated.ll b/llvm/test/CodeGen/X86/preallocated.ll
--- a/llvm/test/CodeGen/X86/preallocated.ll
+++ b/llvm/test/CodeGen/X86/preallocated.ll
@@ -129,11 +129,11 @@
 ; CHECK: pushl [[REGISTER2]]
 ; CHECK: calll _init
-  call void @foo_ret_p(%Foo* %b1, %Foo* preallocated(%Foo) %b2) ["preallocated"(token %t2)]
+  call void @foo_ret_p(%Foo* sret(%Foo) %b1, %Foo* preallocated(%Foo) %b2) ["preallocated"(token %t2)]
 ; CHECK-NOT: subl {{\$[0-9]+}}, %esp
 ; CHECK-NOT: pushl
 ; CHECK: calll _foo_ret_p
-  call void @foo_ret_p(%Foo* %tmp, %Foo* preallocated(%Foo) %b1) ["preallocated"(token %t1)]
+  call void @foo_ret_p(%Foo* sret(%Foo) %tmp, %Foo* preallocated(%Foo) %b1) ["preallocated"(token %t1)]
 ; CHECK-NOT: subl {{\$[0-9]+}}, %esp
 ; CHECK-NOT: pushl
 ; CHECK: calll _foo_ret_p
 
@@ -150,7 +150,7 @@
 ; CHECK: subl $8, %esp
 ; CHECK: movl $9, %eax
 ; CHECK: calll _foo_inreg_p
-  call void @foo_inreg_p(i32 9, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
+  call void @foo_inreg_p(i32 inreg 9, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/tailcall-msvc-conventions.ll b/llvm/test/CodeGen/X86/tailcall-msvc-conventions.ll
--- a/llvm/test/CodeGen/X86/tailcall-msvc-conventions.ll
+++ b/llvm/test/CodeGen/X86/tailcall-msvc-conventions.ll
@@ -181,7 +181,7 @@
 declare x86_fastcallcc void @fastcall2(i32 inreg %a, i32 inreg %b)
 define void @cdecl_fastcall_tail(i32 %a, i32 %b) {
-  tail call x86_fastcallcc void @fastcall2(i32 %a, i32 %b)
+  tail call x86_fastcallcc void @fastcall2(i32 inreg %a, i32 inreg %b)
   ret void
 }
 ; fastcall2 won't pop anything.
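
The release note above has a practical consequence for IR producers: parameter ABI attributes (signext, zeroext, inreg, sret, byval, preallocated, inalloca, returned) must now be present on each call site, since lowering no longer falls back to the callee declaration. A minimal sketch of conforming IR follows; the @callee/@caller names are hypothetical and not part of the patch:

declare void @callee(i8 signext, i32* byval(i32))

define void @caller(i8 %c, i32* %p) {
  ; The signext and byval attributes are repeated on the call itself; after
  ; this change, attributes on @callee's declaration are not consulted when
  ; this call is lowered.
  call void @callee(i8 signext %c, i32* byval(i32) %p)
  ret void
}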