diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -222,6 +222,11 @@
     // Calling convention between AArch64 Advanced SIMD functions
     AArch64_VectorCall = 97,
 
+    /// Calling convention for emscripten __invoke_* functions. The first
+    /// argument is required to be the function ptr being indirectly called.
+    /// The remainder matches the regular calling convention.
+    WASM_EmscriptenInvoke = 98,
+
     /// The highest possible calling convention ID. Must be some 2^k - 1.
     MaxID = 1023
   };
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -623,7 +623,8 @@
          CallConv == CallingConv::Cold ||
          CallConv == CallingConv::PreserveMost ||
          CallConv == CallingConv::PreserveAll ||
-         CallConv == CallingConv::CXX_FAST_TLS;
+         CallConv == CallingConv::CXX_FAST_TLS ||
+         CallConv == CallingConv::WASM_EmscriptenInvoke;
 }
 
 SDValue
@@ -682,6 +683,16 @@
   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
 
+  // The generic code may have added an sret argument. If we're lowering an
+  // invoke function, the ABI requires that the function pointer be the first
+  // argument, so we may have to swap the arguments.
+  if (CallConv == CallingConv::WASM_EmscriptenInvoke && Outs.size() >= 2 &&
+      Outs[0].Flags.isSRet()) {
+    std::swap(Outs[0], Outs[1]);
+    std::swap(OutVals[0], OutVals[1]);
+  }
+
   unsigned NumFixedArgs = 0;
   for (unsigned I = 0; I < Outs.size(); ++I) {
     const ISD::OutputArg &Out = Outs[I];
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -418,7 +418,7 @@
   Args.append(CI->arg_begin(), CI->arg_end());
   CallInst *NewCall = IRB.CreateCall(getInvokeWrapper(CI), Args);
   NewCall->takeName(CI);
-  NewCall->setCallingConv(CI->getCallingConv());
+  NewCall->setCallingConv(CallingConv::WASM_EmscriptenInvoke);
   NewCall->setDebugLoc(CI->getDebugLoc());
 
   // Because we added the pointer to the callee as first argument, all
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions-whitelist.ll b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions-whitelist.ll
--- a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions-whitelist.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions-whitelist.ll
@@ -38,7 +38,7 @@
           to label %invoke.cont unwind label %lpad
 ; CHECK: entry:
 ; CHECK-NEXT: store i32 0, i32*
-; CHECK-NEXT: call void @__invoke_void(void ()* @foo)
+; CHECK-NEXT: void @__invoke_void(void ()* @foo)
 
 invoke.cont:                                      ; preds = %entry
   br label %try.cont
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
--- a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
@@ -16,7 +16,7 @@
           to label %invoke.cont unwind label %lpad
 ; CHECK: entry:
 ; CHECK-NEXT: store i32 0, i32* @__THREW__
-; CHECK-NEXT: call void @__invoke_void_i32(void (i32)* @foo, i32 3)
+; CHECK-NEXT: void @__invoke_void_i32(void (i32)* @foo, i32 3)
 ; CHECK-NEXT: %[[__THREW__VAL:.*]] = load i32, i32* @__THREW__
 ; CHECK-NEXT: store i32 0, i32* @__THREW__
 ; CHECK-NEXT: %cmp = icmp eq i32 %[[__THREW__VAL]], 1
@@ -72,7 +72,7 @@
           to label %invoke.cont unwind label %lpad
 ; CHECK: entry:
 ; CHECK-NEXT: store i32 0, i32* @__THREW__
-; CHECK-NEXT: call void @__invoke_void_i32(void (i32)* @foo, i32 3)
+; CHECK-NEXT: void @__invoke_void_i32(void (i32)* @foo, i32 3)
 ; CHECK-NEXT: %[[__THREW__VAL:.*]] = load i32, i32* @__THREW__
 ; CHECK-NEXT: store i32 0, i32* @__THREW__
 ; CHECK-NEXT: %cmp = icmp eq i32 %[[__THREW__VAL]], 1
@@ -123,7 +123,7 @@
           to label %invoke.cont unwind label %lpad
 ; CHECK: entry:
 ; CHECK-NEXT: store i32 0, i32* @__THREW__
-; CHECK-NEXT: %0 = call noalias i8* @"__invoke_i8*_i8_i8"(i8* (i8, i8)* @bar, i8 signext 1, i8 zeroext 2)
+; CHECK-NEXT: %0 = call cc[[CC:[0-9]+]] noalias i8* @"__invoke_i8*_i8_i8"(i8* (i8, i8)* @bar, i8 signext 1, i8 zeroext 2)
 
 invoke.cont:                                      ; preds = %entry
   br label %try.cont
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-sret.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-sret.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-sret.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -asm-verbose=false -enable-emscripten-sjlj | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+%struct.__jmp_buf_tag = type { [6 x i32], i32, [32 x i32] }
+
+declare i32 @setjmp(%struct.__jmp_buf_tag*) #0
+declare {i32, i32} @returns_struct()
+
+; Test the combination of backend legalization of large return types and the
+; Emscripten sjlj transformation
+define hidden {i32, i32} @legalized_to_sret() {
+entry:
+  %env = alloca [1 x %struct.__jmp_buf_tag], align 16
+  %arraydecay = getelementptr inbounds [1 x %struct.__jmp_buf_tag], [1 x %struct.__jmp_buf_tag]* %env, i32 0, i32 0
+  %call = call i32 @setjmp(%struct.__jmp_buf_tag* %arraydecay) #0
+; This is the function pointer to pass to invoke.
+; It needs to be the first argument (that's what we're testing here)
+; CHECK: i32.const returns_struct
+; This is the sret stack region (as an offset from the stack pointer local)
+; CHECK-NEXT: local.get
+; CHECK-NEXT: i32.const
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: call "__invoke_{i32.i32}"
+  %ret = call {i32, i32} @returns_struct()
+  ret {i32, i32} %ret
+}
+
+attributes #0 = { returns_twice }
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
--- a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
@@ -34,7 +34,7 @@
 ; CHECK-NEXT: phi i32 [ 0, %entry ], [ %[[LONGJMP_RESULT:.*]], %if.end ]
 ; CHECK-NEXT: %[[ARRAYDECAY1:.*]] = getelementptr inbounds [1 x %struct.__jmp_buf_tag], [1 x %struct.__jmp_buf_tag]* %[[BUF]], i32 0, i32 0
 ; CHECK-NEXT: store i32 0, i32* @__THREW__
-; CHECK-NEXT: call void @"__invoke_void_%struct.__jmp_buf_tag*_i32"(void (%struct.__jmp_buf_tag*, i32)* @emscripten_longjmp_jmpbuf, %struct.__jmp_buf_tag* %[[ARRAYDECAY1]], i32 1)
+; CHECK-NEXT: void @"__invoke_void_%struct.__jmp_buf_tag*_i32"(void (%struct.__jmp_buf_tag*, i32)* @emscripten_longjmp_jmpbuf, %struct.__jmp_buf_tag* %[[ARRAYDECAY1]], i32 1)
 ; CHECK-NEXT: %[[__THREW__VAL:.*]] = load i32, i32* @__THREW__
 ; CHECK-NEXT: store i32 0, i32* @__THREW__
 ; CHECK-NEXT: %[[CMP0:.*]] = icmp ne i32 %__THREW__.val, 0
@@ -85,7 +85,7 @@
 ; CHECK: %[[SETJMP_TABLE:.*]] = call i32* @saveSetjmp(
 
 ; CHECK: entry.split:
-; CHECK: call void @__invoke_void(void ()* @foo)
+; CHECK: @__invoke_void(void ()* @foo)
 
 ; CHECK: entry.split.split:
 ; CHECK-NEXT: %[[BUF:.*]] = bitcast i32* %[[SETJMP_TABLE]] to i8*
@@ -105,7 +105,7 @@
 
 ; CHECK: entry.split:
 ; CHECK: store i32 0, i32* @__THREW__
-; CHECK-NEXT: call void @__invoke_void(void ()* @foo)
+; CHECK-NEXT: void @__invoke_void(void ()* @foo)
 ; CHECK-NEXT: %[[__THREW__VAL:.*]] = load i32, i32* @__THREW__
 ; CHECK-NEXT: store i32 0, i32* @__THREW__
 ; CHECK-NEXT: %[[CMP0:.*]] = icmp ne i32 %[[__THREW__VAL]], 0
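Illustration (not part of the patch): a hand-written sketch of the IR shape that LowerEmscriptenEHSjLj produces for a plain invoke of void @foo() once the wrapper call is tagged with the new convention. It assumes WASM_EmscriptenInvoke keeps the value 98 and therefore prints as cc98 (numbered calling conventions have no IR keyword); the function name @caller_sketch, the value names %threw and %cmp, and the branch targets are invented, and the exception-dispatch code the pass really emits after the compare is abbreviated to a plain branch.

; Sketch only, mirroring the CHECK lines in the tests above.
@__THREW__ = external global i32

declare void @__invoke_void(void ()*)
declare void @foo()

define void @caller_sketch() {
entry:
  ; Guarded call through the invoke wrapper; the callee pointer is the first
  ; argument, as WASM_EmscriptenInvoke requires, and the numeric calling
  ; convention prints as cc98.
  store i32 0, i32* @__THREW__
  call cc98 void @__invoke_void(void ()* @foo)
  %threw = load i32, i32* @__THREW__
  store i32 0, i32* @__THREW__
  %cmp = icmp eq i32 %threw, 1
  br i1 %cmp, label %thrown, label %cont

thrown:                                           ; abbreviated exception path
  br label %cont

cont:
  ret void
}

Tagging the call site this way is what lets WebAssemblyTargetLowering::LowerCall recognize invoke wrappers later on: when return-value legalization introduces an sret pointer argument, LowerCall swaps it with the callee pointer so the function pointer stays first, which is what the new lower-em-sjlj-sret.ll test checks.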