Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -812,6 +812,21 @@
          !MI.getOperand(1).isUndef();
 }
 
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+  // Currently all conventions wait, but this may not always be the case.
+  //
+  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+  // sense to omit the wait and do it in the caller.
+  return true;
+}
+
+/// \returns true if the callee is expected to wait for any outstanding waits
+/// before returning.
+static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
+  return true;
+}
+
 /// Generate s_waitcnt instruction to be placed before cur_Inst.
 /// Instructions of a given type are returned in order,
 /// but instructions of different types can complete out of order.
@@ -927,91 +942,91 @@
       }
     }
 
-#if 0 // TODO: the following code to handle CALL.
-      // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
-      // However, there is a problem with EXP_CNT, because the call cannot
-      // easily tell if a register is used in the function, and if it did, then
-      // the referring instruction would have to have an S_WAITCNT, which is
-      // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
-      // before the call.
-      if (MI.getOpcode() == SC_CALL) {
-        if (ScoreBrackets->getScoreUB(EXP_CNT) >
-            ScoreBrackets->getScoreLB(EXP_CNT)) {
-          ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-          EmitWaitcnt |= CNT_MASK(EXP_CNT);
-        }
-      }
-#endif
-
-    // FIXME: Should not be relying on memoperands.
-    // Look at the source operands of every instruction to see if
-    // any of them results from a previous memory operation that affects
-    // its current usage. If so, an s_waitcnt instruction needs to be
-    // emitted.
-    // If the source operand was defined by a load, add the s_waitcnt
-    // instruction.
-    for (const MachineMemOperand *Memop : MI.memoperands()) {
-      unsigned AS = Memop->getAddrSpace();
-      if (AS != AMDGPUAS::LOCAL_ADDRESS)
-        continue;
-      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-      // VM_CNT is only relevant to vgpr or LDS.
-      ScoreBrackets.determineWait(
-          VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-    }
+    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
+      // Don't bother waiting on anything except the call address. The function
+      // is going to insert a wait on everything in its prolog. This still needs
+      // to be careful if the call target is a load (e.g. a GOT load).
+      Wait = AMDGPU::Waitcnt();
 
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-      const MachineOperand &Op = MI.getOperand(I);
-      const MachineRegisterInfo &MRIA = *MRI;
-      RegInterval Interval =
-          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+      int CallAddrOpIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+      RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
+                                                          CallAddrOpIdx, false);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        if (TRI->isVGPR(MRIA, Op.getReg())) {
-          // VM_CNT is only relevant to vgpr or LDS.
-          ScoreBrackets.determineWait(
-              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-        }
         ScoreBrackets.determineWait(
             LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
-    }
-    // End of for loop that looks at all source operands to decide vm_wait_cnt
-    // and lgk_wait_cnt.
-
-    // Two cases are handled for destination operands:
-    // 1) If the destination operand was defined by a load, add the s_waitcnt
-    // instruction to guarantee the right WAW order.
-    // 2) If a destination operand that was used by a recent export/store ins,
-    // add s_waitcnt on exp_cnt to guarantee the WAR order.
-    if (MI.mayStore()) {
+    } else {
       // FIXME: Should not be relying on memoperands.
+      // Look at the source operands of every instruction to see if
+      // any of them results from a previous memory operation that affects
+      // its current usage. If so, an s_waitcnt instruction needs to be
+      // emitted.
+      // If the source operand was defined by a load, add the s_waitcnt
+      // instruction.
       for (const MachineMemOperand *Memop : MI.memoperands()) {
         unsigned AS = Memop->getAddrSpace();
         if (AS != AMDGPUAS::LOCAL_ADDRESS)
           continue;
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+        // VM_CNT is only relevant to vgpr or LDS.
         ScoreBrackets.determineWait(
             VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-        ScoreBrackets.determineWait(
-            EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
       }
-    }
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-      MachineOperand &Def = MI.getOperand(I);
-      const MachineRegisterInfo &MRIA = *MRI;
-      RegInterval Interval =
-          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
-      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        if (TRI->isVGPR(MRIA, Def.getReg())) {
+
+      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+        const MachineOperand &Op = MI.getOperand(I);
+        const MachineRegisterInfo &MRIA = *MRI;
+        RegInterval Interval =
+            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+        for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+          if (TRI->isVGPR(MRIA, Op.getReg())) {
+            // VM_CNT is only relevant to vgpr or LDS.
+            ScoreBrackets.determineWait(
+                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+          }
+          ScoreBrackets.determineWait(
+              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+        }
+      }
+      // End of for loop that looks at all source operands to decide vm_wait_cnt
+      // and lgk_wait_cnt.
+
+      // Two cases are handled for destination operands:
+      // 1) If the destination operand was defined by a load, add the s_waitcnt
+      // instruction to guarantee the right WAW order.
+      // 2) If a destination operand that was used by a recent export/store ins,
+      // add s_waitcnt on exp_cnt to guarantee the WAR order.
+      if (MI.mayStore()) {
+        // FIXME: Should not be relying on memoperands.
+        for (const MachineMemOperand *Memop : MI.memoperands()) {
+          unsigned AS = Memop->getAddrSpace();
+          if (AS != AMDGPUAS::LOCAL_ADDRESS)
+            continue;
+          unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
          ScoreBrackets.determineWait(
              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
          ScoreBrackets.determineWait(
              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
         }
-        ScoreBrackets.determineWait(
-            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
-    } // End of for loop that looks at all dest operands.
+      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+        MachineOperand &Def = MI.getOperand(I);
+        const MachineRegisterInfo &MRIA = *MRI;
+        RegInterval Interval =
+            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
+        for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+          if (TRI->isVGPR(MRIA, Def.getReg())) {
+            ScoreBrackets.determineWait(
+                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+            ScoreBrackets.determineWait(
+                EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+          }
+          ScoreBrackets.determineWait(
+              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+        }
+      } // End of for loop that looks at all dest operands.
+    }
   }
 
   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
@@ -1228,6 +1243,14 @@
     }
   } else if (TII->isSMRD(Inst)) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else if (Inst.isCall()) {
+    if (callWaitsOnFunctionReturn(Inst)) {
+      // Act as a wait on everything
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+    } else {
+      // May need to wait for anything.
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+    }
   } else {
     switch (Inst.getOpcode()) {
     case AMDGPU::S_SENDMSG:
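Note (illustration only, not part of the patch): the caller-entry behavior above can be pictured with a minimal, hypothetical IR example; the function and value names below are invented. Because the callee is assumed to begin with an s_waitcnt 0 in its prologue, the load feeding the call argument no longer forces an s_waitcnt in the caller, while the registers holding the call address are still checked against lgkmcnt (which matters when the target address is itself produced by a load, e.g. a GOT access).

; Hypothetical sketch; names are made up for illustration.
; The load defining %val needs no s_waitcnt before the call is emitted,
; since @callee is assumed to wait on everything in its own prologue.
declare void @callee(<2 x i32>)

define amdgpu_kernel void @no_wait_before_call(<2 x i32> addrspace(1)* %p) {
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %p
  call void @callee(<2 x i32> %val)
  ret void
}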
Index: test/CodeGen/AMDGPU/call-argument-types.ll
===================================================================
--- test/CodeGen/AMDGPU/call-argument-types.ll
+++ test/CodeGen/AMDGPU/call-argument-types.ll
@@ -154,7 +154,7 @@
 ; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s3
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
@@ -174,7 +174,7 @@
 ; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
@@ -205,7 +205,7 @@
 ; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
@@ -224,7 +224,7 @@
 ; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
@@ -267,8 +267,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
 ; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* null
   call void @external_void_func_v2i64(<2 x i64> %val)
@@ -290,8 +290,8 @@
 ; GCN: buffer_load_dwordx4 v[0:3]
 ; GCN: v_mov_b32_e32 v4, 1
 ; GCN: v_mov_b32_e32 v5, 2
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
   %load = load <2 x i64>, <2 x i64> addrspace(1)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> 
@@ -307,8 +307,8 @@
 ; GCN-DAG: v_mov_b32_e32 v6, 3
 ; GCN-DAG: v_mov_b32_e32 v7, 4
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
   %load = load <2 x i64>, <2 x i64> addrspace(1)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> 
@@ -483,8 +483,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
 ; GCN: buffer_load_dwordx2 v[0:1]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
   call void @external_void_func_v2i32(<2 x i32> %val)
@@ -527,8 +527,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
 ; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
   call void @external_void_func_v4i32(<4 x i32> %val)
@@ -562,8 +562,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
   %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
@@ -591,8 +591,8 @@
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
 ; GCN-DAG: buffer_load_dwordx4 v[8:11], off
 ; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
   %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
@@ -609,8 +609,8 @@
 ; GCN-DAG: buffer_load_dwordx4 v[20:23], off
 ; GCN-DAG: buffer_load_dwordx4 v[24:27], off
 ; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
   %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
@@ -647,12 +647,11 @@
   ret void
 }
 
-; FIXME: No wait after call
 ; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
 ; GCN: v_mov_b32_e32 v0, 42
 ; GCN: s_swappc_b64 s[30:31],
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0
+; GCN-NOT: s_waitcnt
+; GCN: buffer_store_dword v0, off, s[36:39], 0
 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
   %val = call i32 @external_i32_func_i32(i32 42)
   store volatile i32 %val, i32 addrspace(1)* %out
@@ -662,8 +661,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
 ; GCN: buffer_load_ubyte v0, off
 ; GCN: buffer_load_dword v1, off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
Index: test/CodeGen/AMDGPU/call-waitcnt.ll
===================================================================
--- test/CodeGen/AMDGPU/call-waitcnt.ll
+++ test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -17,7 +17,6 @@
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT: s_endpgm
   %vgpr = load volatile i32, i32 addrspace(3)* %ptr
@@ -67,7 +66,6 @@
 ; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_mov_b32_e32 v32, 0
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v0, s34
 ; GCN-NEXT: v_mov_b32_e32 v1, s35
 ; GCN-NEXT: global_store_dword v[0:1], v32, off
@@ -91,7 +89,6 @@
 ; GCN-NEXT: s_addc_u32 s7, s7, func.return@rel32@hi+4
 ; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v1, s34
 ; GCN-NEXT: v_mov_b32_e32 v2, s35
 ; GCN-NEXT: global_store_dword v[1:2], v0, off
@@ -138,7 +135,7 @@
   ret void
 }
 
-; Need to wait for the address dependency
+; No need to wait for the load.
 define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
 ; GCN-LABEL: tail_call_memory_arg_load:
 ; GCN:       ; %bb.0:
@@ -147,7 +144,6 @@
 ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4
 ; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[6:7]
   %vgpr = load volatile i32, i32 addrspace(3)* %ptr
   tail call void @func(i32 %vgpr)
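Note (illustration only, not part of the patch): the return side works the same way. Because callWaitsOnFunctionReturn() reports that the callee waits on its outstanding counters before returning, the caller's score brackets are treated as fully resolved after the call, so the post-call s_waitcnt instructions removed in the tests above are redundant. A minimal hypothetical sketch, with invented names:

; Hypothetical sketch; names are made up for illustration.
; The value returned by @get_value can be stored right after the call
; returns, with no extra s_waitcnt, because the callee is assumed to have
; already waited before returning.
declare i32 @get_value()

define amdgpu_kernel void @use_result_after_call(i32 addrspace(1)* %out) {
  %v = call i32 @get_value()
  store i32 %v, i32 addrspace(1)* %out
  ret void
}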