Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -5044,7 +5044,7 @@
   the operand.  (The behavior for relocatable symbol expressions is a
   target-specific behavior for this typically target-independent modifier)
 - ``H``: Print a memory reference with additional offset +8.
-- ``P``: Print a memory reference or operand for use as the argument of a call
+- ``P``: Print a branch target operand for use as the argument of a call
   instruction. (E.g. omit ``(rip)``, even though it's PC-relative.)

 XCore:
Index: llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/PBQPRAConstraint.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/CodeGen.h"
 #include <memory>
@@ -28,6 +29,7 @@
 namespace llvm {

 class CallLowering;
+class GlobalValue;
 class InlineAsmLowering;
 class InstrItineraryData;
 struct InstrStage;
@@ -312,6 +314,11 @@
                                            unsigned PhysReg) const {
     return false;
   }
+
+  virtual unsigned char
+  classifyGlobalFunctionReference(const GlobalValue *GV) const {
+    return 0;
+  }
 };

 } // end namespace llvm
Index: llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -1137,11 +1137,135 @@
     TLI->AdjustInstrPostInstrSelection(*MIB, Node);
 }

+// Collect the indexes of the call operands in the inline asm into CallOpNos.
+// The ${NUM:P} modifier means "print a branch target operand for use as the
+// argument of a call instruction", so use it together with the flag operands
+// of the inline asm to compute the corresponding operand indexes in the
+// inline asm node.
+static bool getInlineAsmCallOperands(const char *AsmStr, SDNode *Node,
+                                     SmallSet<unsigned, 4> &CallOpNos) {
+  const char *CurPtr = AsmStr;
+  unsigned NumOperands = Node->getNumOperands();
+  while (*CurPtr) {
+    if (*CurPtr++ != '$') // Operands start with '$'.
+      continue;
+
+    if (*CurPtr == '$') { // Skip "$$".
+      ++CurPtr;
+      continue;
+    }
+
+    bool HasCurlyBraces = false;
+    if (*CurPtr == '{') { // ${variable}
+      ++CurPtr; // Consume '{' character.
+      HasCurlyBraces = true;
+    }
+
+    if (!HasCurlyBraces)
+      continue;
+
+    // If we have ${:xxx}, this is not a real operand reference, skip it.
+    if (*CurPtr == ':') {
+      ++CurPtr;
+      const char *StrStart = CurPtr;
+      const char *StrEnd = strchr(StrStart, '}');
+      if (!StrEnd)
+        report_fatal_error("Unterminated ${:xxx} operand in inline asm"
+                           " string: '" +
+                           Twine(AsmStr) + "'");
+
+      CurPtr = StrEnd + 1;
+      continue;
+    }
+
+    // Get the operand number.
+    const char *IDStart = CurPtr;
+    const char *IDEnd = IDStart;
+    while (isDigit(*IDEnd))
+      ++IDEnd;
+
+    unsigned Val;
+    if (StringRef(IDStart, IDEnd - IDStart).getAsInteger(10, Val))
+      report_fatal_error("Bad $ operand number in inline asm string: '" +
+                         Twine(AsmStr) + "'");
+    CurPtr = IDEnd;
+
+    if (Val >= NumOperands - 1)
+      report_fatal_error("Invalid $ operand number in inline asm string: '" +
+                         Twine(AsmStr) + "'");
+
+    if (*CurPtr != ':') { // ${NUM} with no modifier; skip the closing '}'.
+      ++CurPtr;
+      continue;
+    }
+
+    // Check for a modifier character, e.g. ${NUM:MODIFIER}.
+    ++CurPtr; // Consume ':' character.
+
+    if (*CurPtr++ != 'P') // Not a call operand.
+      continue;
+
+    if (*CurPtr != '}')
+      report_fatal_error("Bad ${} expression in inline asm string: '" +
+                         Twine(AsmStr) + "'");
+    ++CurPtr; // Consume '}' character.
+
+    // We have the value number; now scan the flag operands to find the
+    // SDNode operand number that corresponds to Val.
+    unsigned OpNo = InlineAsm::Op_FirstOperand;
+    unsigned Flags =
+        cast<ConstantSDNode>(Node->getOperand(OpNo))->getZExtValue();
+    for (; Val; --Val) {
+      if (OpNo >= Node->getNumOperands()) {
+        report_fatal_error("invalid operand in inline asm: '" + Twine(AsmStr) +
+                           "'");
+      }
+      OpNo += InlineAsm::getNumOperandRegisters(Flags) + 1;
+      Flags = cast<ConstantSDNode>(Node->getOperand(OpNo))->getZExtValue();
+    }
+
+    if (InlineAsm::getKind(Flags) != InlineAsm::Kind_Mem)
+      continue;
+
+    // Collect the memory operand index of the call in the inline asm.
+    CallOpNos.insert(OpNo);
+  }
+
+  return !CallOpNos.empty();
+}
+
+static GlobalAddressSDNode *getGAFromLoad(SDNode *Node, unsigned OpNo) {
+  SDValue Op = Node->getOperand(OpNo);
+
+  // First make sure Op is a load.
+  const MachineSDNode *MN = dyn_cast<MachineSDNode>(Op.getNode());
+  if (!MN)
+    return nullptr;
+  if (MN->memoperands_empty())
+    return nullptr;
+  MachineSDNode::mmo_iterator I = MN->memoperands_begin();
+  if (!(*I)->isLoad())
+    return nullptr;
+
+  // Make sure the memory index register is noreg.
+  // TODO: Refine this; checking these index registers may be unnecessary for
+  // a call in inline asm and a GOT load of a global address.
+  SDNode *NodeIndex = Node->getOperand(OpNo + 2).getNode();
+  SDNode *OpIndex = Op.getOperand(2).getNode();
+  if (cast<RegisterSDNode>(OpIndex)->getReg() ||
+      cast<RegisterSDNode>(NodeIndex)->getReg())
+    return nullptr;
+
+  // Then make sure the displacement is a global address.
+  SDNode *GA = Op.getOperand(3).getNode();
+  GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(GA);
+  return TGA;
+}
+
 /// EmitSpecialNode - Generate machine code for a target-independent node and
 /// needed dependencies.
-void InstrEmitter::
-EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
-                DenseMap<SDValue, Register> &VRBaseMap) {
+void InstrEmitter::EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+                                   DenseMap<SDValue, Register> &VRBaseMap) {
   switch (Node->getOpcode()) {
   default:
 #ifndef NDEBUG
@@ -1221,8 +1345,8 @@
   case ISD::INLINEASM:
   case ISD::INLINEASM_BR: {
     unsigned NumOps = Node->getNumOperands();
-    if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
-      --NumOps;  // Ignore the glue operand.
+    if (Node->getOperand(NumOps - 1).getValueType() == MVT::Glue)
+      --NumOps; // Ignore the glue operand.

     // Create the inline asm machine instruction.
     unsigned TgtOpc = Node->getOpcode() == ISD::INLINEASM_BR
@@ -1239,8 +1363,8 @@
     // Add the HasSideEffect, isAlignStack, AsmDialect, MayLoad and MayStore
     // bits.
     int64_t ExtraInfo =
-      cast<ConstantSDNode>(Node->getOperand(InlineAsm::Op_ExtraInfo))->
-                          getZExtValue();
+        cast<ConstantSDNode>(Node->getOperand(InlineAsm::Op_ExtraInfo))
+            ->getZExtValue();
     MIB.addImm(ExtraInfo);

     // Remember to operand index of the group flags.
@@ -1249,15 +1373,44 @@
     // Remember registers that are part of early-clobber defs.
     SmallVector<unsigned, 4> ECRegs;

+    // Find out whether any call operand in the inline asm needs to be
+    // replaced.
+    // In the Linux PIC model, the global address (GA) of a global value (GV)
+    // is obtained by loading it from its GOT slot. Different instructions use
+    // a global address in different ways.
+    //
+    // For example, on X86:
+    // 1. To assign a value to the GV we can use "MOV GV, ...", which
+    //    corresponds to "MOV (global address), ...".
+    // 2. To take the address of the GV we can use "LEA GV, ...", which
+    //    corresponds to "LEA (global address), ...".
+    // 3. But if we call the label (a GV can be a label) with "CALL GV", that
+    //    is not the same as "CALL (global address)".
+    //
+    // So the global address of a global value has to be handled case by case.
+    // This already happens for normal IR/MIR, where each instruction uses a
+    // global address for a single purpose. Inline asm is different: it is
+    // represented by a single IR/MIR instruction but may contain many
+    // instructions that use the same or different global addresses for
+    // multiple purposes, and LLVM does not distinguish the individual
+    // instructions inside the inline asm IR/MIR.
+    //
+    // TODO: Other targets may have the same problem and need to be fixed too;
+    // this is an architectural defect of LLVM inline asm.
+    bool Replace = false;
+    SmallSet<unsigned, 4> CallOpNos;
+    const TargetMachine &TM = MF->getTarget();
+    if (TM.getTargetTriple().isX86() && TM.isPositionIndependent() &&
+        TM.getTargetTriple().isOSLinux())
+      Replace = getInlineAsmCallOperands(AsmStr, Node, CallOpNos);
+
     // Add all of the operand registers to the instruction.
     for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
       unsigned Flags =
-        cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+          cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
       const unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);

       GroupIdx.push_back(MIB->getNumOperands());
       MIB.addImm(Flags);
-      ++i;  // Skip the ID value.
+      ++i; // Skip the ID value.

       switch (InlineAsm::getKind(Flags)) {
       default: llvm_unreachable("Bad flags!");
@@ -1282,27 +1435,83 @@
           ECRegs.push_back(Reg);
         }
         break;
-      case InlineAsm::Kind_RegUse:  // Use of register.
-      case InlineAsm::Kind_Imm:  // Immediate.
-      case InlineAsm::Kind_Mem:  // Addressing mode.
+      case InlineAsm::Kind_RegUse: { // Use of register.
         // The addressing mode has been selected, just add all of the
         // operands to the machine instruction.
-        for (unsigned j = 0; j != NumVals; ++j, ++i)
+        for (unsigned J = 0; J != NumVals; ++J, ++i)
+          AddOperand(MIB, Node->getOperand(i), 0, nullptr, VRBaseMap,
+                     /*IsDebug=*/false, IsClone, IsCloned);
+
+        unsigned DefGroup = 0;
+        if (InlineAsm::isUseOperandTiedToDef(Flags, DefGroup)) {
+          unsigned DefIdx = GroupIdx[DefGroup] + 1;
+          unsigned UseIdx = GroupIdx.back() + 1;
+          for (unsigned J = 0; J != NumVals; ++J)
+            MIB->tieOperands(DefIdx + J, UseIdx + J);
+        }
+        break;
+      }
+      case InlineAsm::Kind_Imm: // Immediate.
+        for (unsigned J = 0; J != NumVals; ++J, ++i)
           AddOperand(MIB, Node->getOperand(i), 0, nullptr, VRBaseMap,
                      /*IsDebug=*/false, IsClone, IsCloned);
+        break;
+      case InlineAsm::Kind_Mem: { // Addressing mode.
+        bool IsCallOp = CallOpNos.contains(i - 1);
+
+        if (Replace && IsCallOp) {
+          GlobalAddressSDNode *TGA = nullptr;
+          if (TM.getCodeModel() == CodeModel::Large) {
+            // Handle "call func_label" in the large code model.
+            // In the large code model the function label can only be obtained
+            // from a load, so directly call *(load address).
+            // TODO: Refine this: we assume the index register of the inline
+            // "call func_label" is always noreg, and the check for a load may
+            // be unnecessary, since in the large code model func_label must
+            // be obtained from a load.
+            SDNode *LoadCallAddr = Node->getOperand(i).getNode();
+            const MachineSDNode *MN = dyn_cast<MachineSDNode>(LoadCallAddr);
+            if (MN && !MN->memoperands_empty() &&
+                (*MN->memoperands_begin())->isLoad()) {
+              for (unsigned J = 0; J != NumVals; ++J, ++i) {
+                AddOperand(MIB, LoadCallAddr->getOperand(J), 0, nullptr,
+                           VRBaseMap,
+                           /*IsDebug=*/false, IsClone, IsCloned);
+              }
+              break;
+            }
+          } else {
+            TGA = getGAFromLoad(Node, i);
+          }

-        // Manually set isTied bits.
-        if (InlineAsm::getKind(Flags) == InlineAsm::Kind_RegUse) {
-          unsigned DefGroup = 0;
-          if (InlineAsm::isUseOperandTiedToDef(Flags, DefGroup)) {
-            unsigned DefIdx = GroupIdx[DefGroup] + 1;
-            unsigned UseIdx = GroupIdx.back() + 1;
-            for (unsigned j = 0; j != NumVals; ++j)
-              MIB->tieOperands(DefIdx + j, UseIdx + j);
+          // In the PIC model (non-large code model) we want to print
+          // "call func_label" as "call func_label@plt".
+          if (TGA) {
+            MachineInstr *MI = MIB.getInstr();
+            unsigned OldFlagsID = MI->getNumOperands() - 1;
+            unsigned ConstraintID = InlineAsm::getMemoryConstraintID(Flags);
+            unsigned NewFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+            NewFlags = InlineAsm::getFlagWordForMem(NewFlags, ConstraintID);
+            MI->RemoveOperand(OldFlagsID);
+            MIB.addImm(NewFlags);
+
+            const GlobalValue *GV = TGA->getGlobal();
+            unsigned char OpFlags =
+                MF->getSubtarget().classifyGlobalFunctionReference(GV);
+            MIB.addGlobalAddress(TGA->getGlobal(), TGA->getOffset(), OpFlags);
+            i += NumVals;
+            break;
           }
         }
+
+        for (unsigned J = 0; J != NumVals; ++J, ++i) {
+          AddOperand(MIB, Node->getOperand(i), 0, nullptr, VRBaseMap,
+                     /*IsDebug=*/false, IsClone, IsCloned);
+        }
         break;
       }
+      }
     }

     // GCC inline assembly allows input operands to also be early-clobber
Index: llvm/lib/Target/X86/X86AsmPrinter.cpp
===================================================================
--- llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -605,9 +605,13 @@
         PrintMemReference(MI, OpNo, O, "H");
       }
       return false;
-    case 'P': // Don't print @PLT, but do print as memory.
+    case 'P': // Call operand modifier, e.g. omit RIP, add @PLT.
       if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
-        PrintIntelMemReference(MI, OpNo, O, "no-rip");
+        if (Subtarget->isPositionIndependent() &&
+            MI->getOperand(OpNo).isGlobal())
+          PrintSymbolOperand(MI->getOperand(OpNo), O);
+        else
+          PrintIntelMemReference(MI, OpNo, O, "no-rip");
       } else {
         PrintMemReference(MI, OpNo, O, "no-rip");
       }
Index: llvm/test/CodeGen/X86/inline-asm-call.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/inline-asm-call.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=CHECK-X64
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic -code-model=large | FileCheck %s --check-prefix=CHECK-X64-LARGE
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -relocation-model=pic -code-model=large | FileCheck %s --check-prefix=CHECK-X86-LARGE
+
+; The tested IR came from the following test.c.
+; The "call sincos" in the inline asm should generate correct code under the
+; PIC model, not "call *(load sincos@GOT)", which performs one extra load and
+; is bad for relocation.
+
+; test.c:
+; extern void sincos();
+; int Arr[10] = {1,};
+; void foo() {
+;   asm {
+;     lea ecx, Arr
+;     lea edx, sincos
+;     call sincos
+;     mov eax, 0
+;     ret }
+; }
+
+; clang -fasm-blocks -S -fpic -emit-llvm test.c
+
+@Arr = global <{ i32, [9 x i32] }> <{ i32 1, [9 x i32] zeroinitializer }>, align 16
+
+define void @foo() {
+; CHECK-X64-LABEL: foo:
+; CHECK-X64:       # %bb.0: # %entry
+; CHECK-X64-NEXT:    movq sincos@GOTPCREL(%rip), %rsi
+; CHECK-X64-NEXT:    movq Arr@GOTPCREL(%rip), %rdi
+; CHECK-X64-NEXT:    #APP
+; CHECK-X64-EMPTY:
+; CHECK-X64-NEXT:    leal (%rdi), %ecx
+; CHECK-X64-NEXT:    leal (%rsi), %edx
+; CHECK-X64-NEXT:    callq sincos@PLT
+; CHECK-X64-NEXT:    movl $0, %eax
+; CHECK-X64-NEXT:    retq
+; CHECK-X64-EMPTY:
+; CHECK-X64-NEXT:    #NO_APP
+; CHECK-X64-NEXT:    retq
+;
+; CHECK-X86-LABEL: foo:
+; CHECK-X86:       # %bb.0: # %entry
+; CHECK-X86-NEXT:    pushl %edi
+; CHECK-X86-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:    .cfi_offset %esi, -12
+; CHECK-X86-NEXT:    .cfi_offset %edi, -8
+; CHECK-X86-NEXT:    calll .L0$pb
+; CHECK-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-X86-NEXT:  .L0$pb:
+; CHECK-X86-NEXT:    popl %eax
+; CHECK-X86-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-X86-NEXT:  .Ltmp0:
+; CHECK-X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
+; CHECK-X86-NEXT:    movl sincos@GOT(%eax), %esi
+; CHECK-X86-NEXT:    movl Arr@GOT(%eax), %edi
+; CHECK-X86-NEXT:    #APP
+; CHECK-X86-EMPTY:
+; CHECK-X86-NEXT:    leal (%edi), %ecx
+; CHECK-X86-NEXT:    leal (%esi), %edx
+; CHECK-X86-NEXT:    calll sincos@PLT
+; CHECK-X86-NEXT:    movl $0, %eax
+; CHECK-X86-NEXT:    retl
+; CHECK-X86-EMPTY:
+; CHECK-X86-NEXT:    #NO_APP
+; CHECK-X86-NEXT:    popl %esi
+; CHECK-X86-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:    popl %edi
+; CHECK-X86-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-X86-NEXT:    retl
+;
+; CHECK-X64-LARGE-LABEL: foo:
+; CHECK-X64-LARGE:       # %bb.0: # %entry
+; CHECK-X64-LARGE-NEXT:  .L0$pb:
+; CHECK-X64-LARGE-NEXT:    leaq .L0$pb(%rip), %rax
+; CHECK-X64-LARGE-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rsi
+; CHECK-X64-LARGE-NEXT:    addq %rax, %rsi
+; CHECK-X64-LARGE-NEXT:    movabsq $sincos@GOT, %r9
+; CHECK-X64-LARGE-NEXT:    movq (%rsi,%r9), %r8
+; CHECK-X64-LARGE-NEXT:    movabsq $Arr@GOT, %rax
+; CHECK-X64-LARGE-NEXT:    movq (%rsi,%rax), %rdi
+; CHECK-X64-LARGE-NEXT:    #APP
+; CHECK-X64-LARGE-EMPTY:
+; CHECK-X64-LARGE-NEXT:    leal (%rdi), %ecx
+; CHECK-X64-LARGE-NEXT:    leal (%r8), %edx
+; CHECK-X64-LARGE-NEXT:    callq *(%rsi,%r9)
+; CHECK-X64-LARGE-NEXT:    movl $0, %eax
+; CHECK-X64-LARGE-NEXT:    retq
+; CHECK-X64-LARGE-EMPTY:
+; CHECK-X64-LARGE-NEXT:    #NO_APP
+; CHECK-X64-LARGE-NEXT:    retq
+;
+; CHECK-X86-LARGE-LABEL: foo:
+; CHECK-X86-LARGE:       # %bb.0: # %entry
+; CHECK-X86-LARGE-NEXT:    pushl %ebx
+; CHECK-X86-LARGE-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-X86-LARGE-NEXT:    pushl %edi
+; CHECK-X86-LARGE-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-X86-LARGE-NEXT:    pushl %esi
+; CHECK-X86-LARGE-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-X86-LARGE-NEXT:    .cfi_offset %esi, -16
+; CHECK-X86-LARGE-NEXT:    .cfi_offset %edi, -12
+; CHECK-X86-LARGE-NEXT:    .cfi_offset %ebx, -8
+; CHECK-X86-LARGE-NEXT:    calll .L0$pb
+; CHECK-X86-LARGE-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-X86-LARGE-NEXT:  .L0$pb:
+; CHECK-X86-LARGE-NEXT:    popl %esi
+; CHECK-X86-LARGE-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-X86-LARGE-NEXT:  .Ltmp0:
+; CHECK-X86-LARGE-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %esi
+; CHECK-X86-LARGE-NEXT:    movl sincos@GOT(%esi), %edi
+; CHECK-X86-LARGE-NEXT:    movl Arr@GOT(%esi), %ebx
+; CHECK-X86-LARGE-NEXT:    #APP
+; CHECK-X86-LARGE-EMPTY:
+; CHECK-X86-LARGE-NEXT:    leal (%ebx), %ecx
+; CHECK-X86-LARGE-NEXT:    leal (%edi), %edx
+; CHECK-X86-LARGE-NEXT:    lcalll *sincos@GOT(%esi)
+; CHECK-X86-LARGE-NEXT:    movl $0, %eax
+; CHECK-X86-LARGE-NEXT:    retl
+; CHECK-X86-LARGE-EMPTY:
+; CHECK-X86-LARGE-NEXT:    #NO_APP
+; CHECK-X86-LARGE-NEXT:    popl %esi
+; CHECK-X86-LARGE-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-X86-LARGE-NEXT:    popl %edi
+; CHECK-X86-LARGE-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-X86-LARGE-NEXT:    popl %ebx
+; CHECK-X86-LARGE-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-X86-LARGE-NEXT:    retl
+entry:
+  call void asm sideeffect inteldialect "lea ecx, $0\0A\09lea edx, qword ptr $1\0A\09call qword ptr ${2:P}\0A\09mov eax, $$0\0A\09ret", "*m,*m,*m,~{eax},~{ecx},~{edx},~{dirflag},~{fpsr},~{flags}"([10 x i32]* bitcast (<{ i32, [9 x i32] }>* @Arr to [10 x i32]*), void (...)* @sincos, void (...)* @sincos)
+  ret void
+}
+
+declare void @sincos(...)
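
A minimal single-operand sketch of the same pattern, reduced from the foo test above. It is not part of the patch or the test file: the function name @bar is hypothetical, the @sincos declaration above is reused, and only the x86-64 Intel-dialect form is shown. With -relocation-model=pic the expectation after this change is a direct "callq sincos@PLT" rather than an indirect call through an extra GOT load:

define void @bar() {
entry:
  ; The ${0:P} call operand should print as a bare PLT-relative symbol
  ; instead of a GOT memory reference.
  call void asm sideeffect inteldialect "call qword ptr ${0:P}", "*m,~{dirflag},~{fpsr},~{flags}"(void (...)* @sincos)
  ret void
}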