Index: lib/Target/NVPTX/NVPTXInstrInfo.td =================================================================== --- lib/Target/NVPTX/NVPTXInstrInfo.td +++ lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1826,6 +1826,7 @@ "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), []>; +let isCall = 1 in { def PrintCallRetInst1 : NVPTXInst<(outs), (ins), "call (retval0), ", [(PrintCall (i32 1))]>; @@ -1883,6 +1884,7 @@ def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ", [(PrintCallUni (i32 0))]>; +} // call instructions def LoadParamMemI64 : LoadParamMemInst; def LoadParamMemI32 : LoadParamMemInst; Index: test/CodeGen/NVPTX/MachineSink-call.ll =================================================================== --- /dev/null +++ test/CodeGen/NVPTX/MachineSink-call.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s | FileCheck %s +target triple = "nvptx64-nvidia-cuda" + +declare void @foo() + +; Load a value, then call a function. Branch, and use the loaded value only on +; one side of the branch. The load shouldn't be sunk beneath the call, because +; the call may modify memory. +define i32 @f(i32 %x, i32* %ptr, i1 %cond) { +Start: + ; CHECK: ld.u32 + %ptr_val = load i32, i32* %ptr + ; CHECK: call.uni + call void @foo() + br i1 %cond, label %L1, label %L2 +L1: + %ptr_val2 = add i32 %ptr_val, 100 + br label %L2 +L2: + %v4 = phi i32 [ %x, %Start ], [ %ptr_val2, %L1 ] + %v5 = add i32 %v4, 1000 + ret i32 %v5 +}