Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -92,6 +92,9 @@
 ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
 FunctionPass *createAMDGPUAnnotateUniformValues();
 
+void initializeAMDGPUDeviceLibsPass(PassRegistry &);
+ModulePass *createAMDGPUDeviceLibsPass(TargetMachine &TM);
+
 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry &);
 extern char &SIFixControlFlowLiveIntervalsID;

Index: lib/Target/AMDGPU/AMDGPUDeviceLibs.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUDeviceLibs.h
@@ -0,0 +1,17 @@
+#ifndef AMDGPU_DEVICE_LIBS_H
+#define AMDGPU_DEVICE_LIBS_H
+
+enum AMDGPUOptionMask {
+  AMD_OPT = 1,
+  UNSAFE_FP_MATH = 2,
+  NO_NANS_FP_MATH = 4,
+  NO_INFS_FP_MATH = 8,
+  FAST_FMA32 = 16,
+  FAST_FMA64 = 32,
+  FP32_DENORMS = 64,
+  FP64_DENORMS = 128,
+  ISA_VERSION_SHIFT = 8,
+  ISA_VERSION_WIDTH = 4,
+};
+
+#endif /* AMDGPU_DEVICE_LIBS_H */

Index: lib/Target/AMDGPU/AMDGPUDeviceLibsPass.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUDeviceLibsPass.cpp
@@ -0,0 +1,278 @@
+#include "AMDGPU.h"
+#include "AMDGPUDeviceLibs.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include <map>
+#include <set>
+
+#define DEBUG_TYPE "amdgpu-device-libs"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceLibs : public ModulePass {
+  const TargetMachine *TM;
+
+  uint64_t KernelOptionMask(Function &F);
+
+  bool IsAttributeSet(Function &F, StringRef Attr) {
+    return F.hasFnAttribute(Attr) &&
+           F.getFnAttribute(Attr).getValueAsString() != "false";
+  }
+
+public:
+  static char ID;
+
+  AMDGPUDeviceLibs(const TargetMachine *TM_ = nullptr)
+    : ModulePass(ID), TM(TM_) {}
+
+  bool runOnModule(Module &M) override;
+};
+
+uint64_t AMDGPUDeviceLibs::KernelOptionMask(Function &F) {
+  uint64_t Mask = 0;
+  Mask |= AMD_OPT; // Always set.
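+  // Bits below ISA_VERSION_SHIFT are boolean options; the bits above them
+  // encode the ISA version (see AMDGPUDeviceLibs.h).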
+  if (IsAttributeSet(F, "unsafe-fp-math"))
+    Mask |= UNSAFE_FP_MATH;
+  if (IsAttributeSet(F, "no-nans-fp-math"))
+    Mask |= NO_NANS_FP_MATH;
+  if (IsAttributeSet(F, "no-infs-fp-math"))
+    Mask |= NO_INFS_FP_MATH;
+
+  const SISubtarget *ST =
+      static_cast<const SISubtarget *>(TM->getSubtargetImpl(F));
+  if (ST->hasFastFMAF32())
+    Mask |= FAST_FMA32;
+
+  AMDGPU::IsaVersion ISA = ST->getIsaVersion();
+  Mask |= uint64_t(ISA.Major * 100 + ISA.Minor * 10 + ISA.Stepping)
+          << ISA_VERSION_SHIFT;
+
+  return Mask;
+}
+
+struct NewArgInfo {
+  Type *Ty;
+  const char *Name;
+};
+
+FunctionType *GetFunctionTypeWithNewArguments(FunctionType *FTy,
+                                              ArrayRef<NewArgInfo> NArgs) {
+  unsigned NumNArgs = NArgs.size();
+  SmallVector<Type *, 8> ArgTypes;
+  ArgTypes.reserve(NumNArgs + FTy->getNumParams());
+
+  // The new argument types come first, followed by the original ones.
+  for (unsigned Idx = 0; Idx != NumNArgs; ++Idx)
+    ArgTypes.push_back(NArgs[Idx].Ty);
+  ArgTypes.insert(ArgTypes.end(), FTy->param_begin(), FTy->param_end());
+
+  return FunctionType::get(FTy->getReturnType(), ArgTypes, FTy->isVarArg());
+}
+
+static AttributeSet ShiftAttributes(LLVMContext &Ctx, const AttributeSet &PAL,
+                                    unsigned NumParams, unsigned Shift) {
+  SmallVector<AttributeSet, 8> Attributes;
+  Attributes.push_back(AttributeSet::get(Ctx, PAL.getRetAttributes()));
+  // Parameter attributes keep their relative order but move right by Shift
+  // slots to make room for the prepended arguments.
+  for (unsigned Idx = 1; Idx <= NumParams; ++Idx) {
+    AttributeSet Params = PAL.getParamAttributes(Idx);
+    AttrBuilder B(Params, Idx);
+    Attributes.push_back(AttributeSet::get(Ctx, Idx + Shift, B));
+  }
+  Attributes.push_back(AttributeSet::get(Ctx, PAL.getFnAttributes()));
+  return AttributeSet::get(Ctx, Attributes);
+}
+
+Function *CreateFunctionWithNewArguments(Function *F,
+                                         ArrayRef<NewArgInfo> NArgs) {
+  unsigned NumNArgs = NArgs.size();
+
+  // Create a new function type with the extra leading parameters.
+  FunctionType *NFTy = cast<FunctionType>(F->getType()->getElementType());
+  NFTy = GetFunctionTypeWithNewArguments(NFTy, NArgs);
+
+  // Create the new function and insert it before F in the module.
+  Function *NF = Function::Create(NFTy, F->getLinkage());
+  F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+
+  Function::arg_iterator NI = NF->arg_begin();
+  for (unsigned Idx = 0; Idx != NumNArgs; ++Idx, ++NI)
+    if (const char *NName = NArgs[Idx].Name)
+      NI->setName(NName);
+
+  // Since we have now created the new function, splice the body of the old
+  // function right into the new function, leaving the old function empty.
+  NF->takeName(F);
+  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Loop over the argument list, transferring uses of the old arguments over
+  // to the new arguments, also transferring over the names as well.
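+  // NI already points past the prepended arguments, so old and new
+  // arguments pair up one-to-one here.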
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++NI) {
+    I->replaceAllUsesWith(&*NI);
+    NI->takeName(&*I);
+  }
+
+  NF->GlobalValue::copyAttributesFrom(F);
+  NF->setAttributes(ShiftAttributes(F->getParent()->getContext(),
+                                    F->getAttributes(),
+                                    (unsigned)F->arg_size(), NumNArgs));
+
+  NF->setCallingConv(F->getCallingConv());
+  if (F->hasGC())
+    NF->setGC(F->getGC());
+  else
+    NF->clearGC();
+
+  return NF;
+}
+
+void AddArgumentsToCallSite(CallSite &CS, SmallVectorImpl<Value *> &Args,
+                            Value *Callee) {
+  Instruction *Call = CS.getInstruction();
+  unsigned NumNArgs = (unsigned)Args.size();
+
+  // Args holds the new leading arguments; append the original operands.
+  Args.insert(Args.end(), CS.arg_begin(), CS.arg_end());
+
+  auto NPAL = ShiftAttributes(CS.getCaller()->getParent()->getContext(),
+                              CS.getAttributes(),
+                              (unsigned)CS.arg_size(), NumNArgs);
+
+  Instruction *NCall;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+    InvokeInst *NII = InvokeInst::Create(Callee, II->getNormalDest(),
+                                         II->getUnwindDest(), Args, "", Call);
+    NII->setCallingConv(II->getCallingConv());
+    NII->setAttributes(NPAL);
+    NCall = NII;
+  } else {
+    CallInst *CI = cast<CallInst>(Call);
+    CallInst *NCI = CallInst::Create(Callee, Args, "", Call);
+    NCI->setCallingConv(CI->getCallingConv());
+    NCI->setAttributes(NPAL);
+    if (CI->isTailCall())
+      NCI->setTailCall();
+    NCall = NCI;
+  }
+  NCall->setDebugLoc(Call->getDebugLoc());
+
+  if (!Call->use_empty())
+    Call->replaceAllUsesWith(NCall);
+  NCall->takeName(Call);
+
+  // Remove the old call from the program, reducing the use-count of the
+  // callee.
+  Call->eraseFromParent();
+}
+
+bool AMDGPUDeviceLibs::runOnModule(Module &M) {
+  SmallVector<Instruction *, 8> ToRemove;
+  std::set<Function *> Worklist, Functions;
+  std::set<CallInst *> Calls;
+
+  for (Function &F : M.getFunctionList())
+    Worklist.insert(&F);
+
+  // Collect every function that transitively calls __oclc_option_mask,
+  // together with the calls that need rewriting.
+  while (!Worklist.empty()) {
+    Function *F = *Worklist.begin();
+    Worklist.erase(F);
+    bool Changed = false;
+    for (Instruction &I : instructions(F)) {
+      CallInst *Call = dyn_cast<CallInst>(&I);
+      if (!Call)
+        continue;
+      Function *Callee = Call->getCalledFunction();
+      if (!Callee)
+        continue;
+      if (Callee->getName() == "__oclc_option_mask" ||
+          Functions.count(Callee)) {
+        Changed |= Functions.insert(F).second;
+        Changed |= Calls.insert(Call).second;
+      }
+    }
+    // If F was newly marked, revisit its callers so the mask gets threaded
+    // through the whole call chain.
+    if (Changed) {
+      for (Use &U : F->uses()) {
+        if (Instruction *Inst = dyn_cast<Instruction>(U.getUser())) {
+          Function *Caller = Inst->getFunction();
+          if (Functions.insert(Caller).second)
+            Worklist.insert(Caller);
+        }
+      }
+    }
+  }
+
+  // Give every non-kernel function on the path a new leading i64
+  // "__option_mask" parameter; kernels materialize the mask as a constant.
+  std::map<Function *, Function *> NewFunctions;
+  for (Function *F : Functions) {
+    if (F->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+        F->getCallingConv() == CallingConv::SPIR_KERNEL)
+      continue;
+    NewArgInfo Info;
+    Info.Ty = IntegerType::get(M.getContext(), 64);
+    Info.Name = "__option_mask";
+    NewFunctions[F] = CreateFunctionWithNewArguments(F, Info);
+  }
+
+  for (CallInst *Call : Calls) {
+    Function *Caller = Call->getFunction();
+    Function *Callee = Call->getCalledFunction();
+
+    // In a kernel the mask is a compile-time constant; elsewhere it is the
+    // parameter that was just prepended to the caller.
+    Value *OptionMask = nullptr;
+    if (Caller->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+        Caller->getCallingConv() == CallingConv::SPIR_KERNEL) {
+      OptionMask = ConstantInt::get(Type::getInt64Ty(M.getContext()),
+                                    KernelOptionMask(*Caller));
+    } else {
+      for (Argument &Arg : Caller->args()) {
+        if (Arg.getName() == "__option_mask") {
+          OptionMask = &Arg;
+          break;
+        }
+      }
+    }
+    assert(OptionMask && "caller was not rewritten to carry the option mask");
+
+    if (Callee->getName() == "__oclc_option_mask") {
"__oclc_option_mask") { + Call->replaceAllUsesWith(OptionMask); + ToRemove.push_back(Call); + } else { + CallSite CS(Call); + Function* NewCallee = NewFunctions[Callee]; + Call->replaceUsesOfWith(Callee, NewCallee); + SmallVector NewArgs; + NewArgs.push_back(OptionMask); + AddArgumentsToCallSite(CS, NewArgs, NewCallee); + } + } + + for (Instruction *I : ToRemove) + I->eraseFromParent(); + return ToRemove.size() > 0; +} + +char AMDGPUDeviceLibs::ID = 0; +} + +ModulePass *llvm::createAMDGPUDeviceLibsPass(TargetMachine& tm) { + return new AMDGPUDeviceLibs(&tm); +} + +INITIALIZE_TM_PASS(AMDGPUDeviceLibs, DEBUG_TYPE, + "AMDGPU Device libraries pass", false, false) Index: lib/Target/AMDGPU/AMDGPUTargetMachine.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -50,6 +50,7 @@ TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + void addPreLinkPasses(PassManagerBase &) override; }; //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -89,6 +89,7 @@ initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); + initializeAMDGPUDeviceLibsPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -169,6 +170,10 @@ FSAttr.getValueAsString(); } +void AMDGPUTargetMachine::addPreLinkPasses(PassManagerBase &PM) { + PM.add(createAMDGPUDeviceLibsPass(*this)); +} + //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -34,6 +34,7 @@ AMDGPUAnnotateUniformValues.cpp AMDGPUAsmPrinter.cpp AMDGPUCodeGenPrepare.cpp + AMDGPUDeviceLibsPass.cpp AMDGPUFrameLowering.cpp AMDGPUTargetObjectFile.cpp AMDGPUIntrinsicInfo.cpp Index: test/CodeGen/AMDGPU/device-libs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/device-libs.ll @@ -0,0 +1,85 @@ +; RUN: opt -S -verify-machineinstrs -amdgpu-device-libs -inline -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck --check-prefix=SI %s +; RUN: opt -S -verify-machineinstrs -amdgpu-device-libs -inline -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck --check-prefix=VI %s + +declare extern_weak i64 @__oclc_option_mask() local_unnamed_addr + +define i64 @get_option_mask() local_unnamed_addr #0 { +entry: + %call = tail call i64 @__oclc_option_mask() #1 + ret i64 %call +} + +define i64 @get_option_mask2() local_unnamed_addr #0 { +entry: + %call = tail call i64 @get_option_mask() #1 + ret i64 %call +} + + +; FUNC-LABEL: {{^}}default_kernel: +; SI: store i64 179473, i64 addrspace(1)* %out +; VI: store i64 205569, i64 addrspace(1)* %out +define amdgpu_kernel void @default_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #2 { +entry: + %call = tail call i64 @__oclc_option_mask() #1 + store i64 %call, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}default_call_kernel: +; SI: store i64 179473, i64 addrspace(1)* %out +; 
+; VI: store i64 205569, i64 addrspace(1)* %out
+define amdgpu_kernel void @default_call_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #2 {
+entry:
+  %call = tail call i64 @get_option_mask() #0
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @default_call2_kernel(
+; SI: store i64 179473, i64 addrspace(1)* %out
+; VI: store i64 205569, i64 addrspace(1)* %out
+define amdgpu_kernel void @default_call2_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #2 {
+entry:
+  %call = tail call i64 @get_option_mask2() #0
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @unsafe_fp_kernel(
+; SI: store i64 179475, i64 addrspace(1)* %out
+; VI: store i64 205571, i64 addrspace(1)* %out
+define amdgpu_kernel void @unsafe_fp_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #3 {
+entry:
+  %call = tail call i64 @__oclc_option_mask() #1
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @no_nans_fp_math_kernel(
+; SI: store i64 179477, i64 addrspace(1)* %out
+; VI: store i64 205573, i64 addrspace(1)* %out
+define amdgpu_kernel void @no_nans_fp_math_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #4 {
+entry:
+  %call = tail call i64 @__oclc_option_mask() #1
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @no_infs_fp_math_kernel(
+; SI: store i64 179481, i64 addrspace(1)* %out
+; VI: store i64 205577, i64 addrspace(1)* %out
+define amdgpu_kernel void @no_infs_fp_math_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #5 {
+entry:
+  %call = tail call i64 @__oclc_option_mask() #1
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+attributes #3 = { nounwind "unsafe-fp-math"="true" }
+attributes #4 = { nounwind "no-nans-fp-math"="true" }
+attributes #5 = { nounwind "no-infs-fp-math"="true" }
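+
+; The expected values decode as the ISA version (Major*100 + Minor*10 +
+; Stepping) shifted left by ISA_VERSION_SHIFT, ORed with the option bits:
+;   hawaii (gfx701): 179473 = (701 << 8) | AMD_OPT | FAST_FMA32
+;   fiji   (gfx803): 205569 = (803 << 8) | AMD_OPT
+; The unsafe/no-nans/no-infs variants add 2, 4, and 8 respectively.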