Index: lib/Target/AMDGPU/AMDGPULibCalls.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -131,6 +131,9 @@
   // sin/cos
   bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);
 
+  // __read_pipe/__write_pipe
+  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
+
   // Get insertion point at entry.
   BasicBlock::iterator getEntryIns(CallInst * UI);
   // Insert an Alloc instruction.
@@ -458,11 +461,11 @@
 }
 
 static inline int getVecSize(const AMDGPULibFunc& FInfo) {
-  return FInfo.Leads[0].VectorSize;
+  return FInfo.getLeads()[0].VectorSize;
 }
 
 static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
-  return (AMDGPULibFunc::EType)FInfo.Leads[0].ArgType;
+  return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
 }
 
 Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) {
@@ -507,8 +510,8 @@
     Value *opr0 = aCI->getArgOperand(0);
 
     AMDGPULibFunc nf;
-    nf.Leads[0].ArgType = FInfo.Leads[0].ArgType;
-    nf.Leads[0].VectorSize = FInfo.Leads[0].VectorSize;
+    nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
+    nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;
 
     nf.setPrefix(AMDGPULibFunc::NATIVE);
     nf.setId(AMDGPULibFunc::EI_SIN);
@@ -537,11 +540,10 @@
   Function *Callee = aCI->getCalledFunction();
 
   FuncInfo FInfo;
-  if (!parseFunctionName(Callee->getName(), &FInfo) ||
+  if (!parseFunctionName(Callee->getName(), &FInfo) || !FInfo.isMangled() ||
       FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
-      getArgType(FInfo) == AMDGPULibFunc::F64 ||
-      !HasNative(FInfo.getId()) ||
-      !(AllNative || useNativeFunc(FInfo.getName())) ) {
+      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
+      !(AllNative || useNativeFunc(FInfo.getName()))) {
     return false;
   }
 
@@ -559,6 +561,74 @@
   return true;
 }
 
+// Clang emits a call to __read_pipe_2 or __read_pipe_4 for the OpenCL
+// read_pipe builtin, appending the type size and alignment as arguments; 2 or
+// 4 is the original number of arguments. The library has optimized versions
+// for the case where the size and alignment are the same power of 2. This
+// function rewrites __read_pipe_2 to __read_pipe_2_N in that case, where N is
+// the type size in bytes (N = 1, 2, 4, 8, ..., 128). __read_pipe_4,
+// __write_pipe_2, and __write_pipe_4 are handled the same way.
+bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
+                                          FuncInfo &FInfo) {
+  auto *Callee = CI->getCalledFunction();
+  if (!Callee->isDeclaration())
+    return false;
+
+  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
+  auto *M = Callee->getParent();
+  auto &Ctx = M->getContext();
+  std::string Name = Callee->getName();
+  auto NumArg = CI->getNumArgOperands();
+  if (NumArg != 4 && NumArg != 6)
+    return false;
+  auto *PacketSize = CI->getArgOperand(NumArg - 2);
+  auto *PacketAlign = CI->getArgOperand(NumArg - 1);
+  if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign))
+    return false;
+  unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue();
+  unsigned Align = cast<ConstantInt>(PacketAlign)->getZExtValue();
+  if (Size != Align || !isPowerOf2_32(Size))
+    return false;
+
+  Type *PtrElemTy;
+  if (Size <= 8)
+    PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
+  else
+    PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8);
+  unsigned PtrArgLoc = CI->getNumArgOperands() - 3;
+  auto PtrArg = CI->getArgOperand(PtrArgLoc);
+  unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
+  auto *PtrTy = llvm::PointerType::get(PtrElemTy, PtrArgAS);
+
+  SmallVector<llvm::Type *, 6> ArgTys;
+  for (unsigned I = 0; I != PtrArgLoc; ++I)
+    ArgTys.push_back(CI->getArgOperand(I)->getType());
+  ArgTys.push_back(PtrTy);
+
+  Name = Name + "_" + std::to_string(Size);
+
+  auto *FTy = FunctionType::get(Callee->getReturnType(),
+                                ArrayRef<Type *>(ArgTys), false);
+  auto *BCast = B.CreatePointerCast(PtrArg, PtrTy);
+
+  SmallVector<Value *, 6> Args;
+  for (unsigned I = 0; I != PtrArgLoc; ++I)
+    Args.push_back(CI->getArgOperand(I));
+  Args.push_back(BCast);
+
+  AMDGPULibFunc NewLibFunc(Name, FTy);
+  auto *F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
+  if (!F)
+    return false;
+  auto *NCI = B.CreateCall(F, Args);
+  NCI->setAttributes(CI->getAttributes());
+  CI->replaceAllUsesWith(NCI);
+  CI->dropAllReferences();
+  CI->eraseFromParent();
+
+  return true;
+}
+
 // This function returns false if no change; return true otherwise.
 bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
   this->CI = CI;
@@ -636,6 +706,11 @@
       return fold_sincos(CI, B, AA);
 
     break;
+  case AMDGPULibFunc::EI_READ_PIPE_2:
+  case AMDGPULibFunc::EI_READ_PIPE_4:
+  case AMDGPULibFunc::EI_WRITE_PIPE_2:
+  case AMDGPULibFunc::EI_WRITE_PIPE_4:
+    return fold_read_write_pipe(CI, B, FInfo);
 
   default:
     break;
@@ -1259,7 +1334,7 @@
   // for OpenCL 2.0 we have only generic implementation of sincos
   // function.
   AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
-  nf.Leads[0].PtrKind = AMDGPULibFunc::GENERIC;
+  nf.getLeads()[0].PtrKind = AMDGPULibFunc::GENERIC;
   Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
   if (!Fsincos) return false;
 
Index: lib/Target/AMDGPU/AMDGPULibFunc.h
===================================================================
--- lib/Target/AMDGPU/AMDGPULibFunc.h
+++ lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -18,7 +18,7 @@
 class Function;
 class Module;
 
-class AMDGPULibFunc {
+class AMDGPULibFuncBase {
 public:
   enum EFuncId {
     EI_NONE,
@@ -26,6 +26,14 @@
     // IMPORTANT: enums below should go in ascending by 1 value order
     // because they are used as indexes in the mangling rules table.
     // don't use explicit value assignment.
+    //
+    // There are two types of library functions: those with mangled
+    // name and those with unmangled name. The enums for the library
+    // functions with mangled name are defined before enums for the
+    // library functions with unmangled name. The enum for the last
+    // library function with mangled name is EI_LAST_MANGLED.
+    //
+    // Library functions with mangled name.
     EI_ABS,
     EI_ABS_DIFF,
     EI_ACOS,
@@ -144,7 +152,6 @@
     EI_POWR,
     EI_PREFETCH,
     EI_RADIANS,
-    EI_READ_PIPE,
     EI_RECIP,
     EI_REMAINDER,
     EI_REMQUO,
@@ -212,7 +219,6 @@
     EI_WRITE_IMAGEF,
     EI_WRITE_IMAGEI,
     EI_WRITE_IMAGEUI,
-    EI_WRITE_PIPE,
     EI_NCOS,
     EI_NEXP2,
     EI_NFMA,
@@ -225,6 +231,14 @@
     EI_FLDEXP,
     EI_CLASS,
     EI_RCBRT,
+    EI_LAST_MANGLED =
+        EI_RCBRT, /* The last library function with mangled name */
+
+    // Library functions with unmangled name.
+    EI_READ_PIPE_2,
+    EI_READ_PIPE_4,
+    EI_WRITE_PIPE_2,
+    EI_WRITE_PIPE_4,
 
     EX_INTRINSICS_COUNT
   };
@@ -298,51 +312,144 @@
     template <typename Stream>
     void mangleItanium(Stream& os);
   };
+  static bool isMangled(EFuncId Id) {
+    return static_cast<unsigned>(Id) <= static_cast<unsigned>(EI_LAST_MANGLED);
+  }
+};
 
+class AMDGPULibFuncImpl : public AMDGPULibFuncBase {
 public:
-  static bool      parse(StringRef mangledName, AMDGPULibFunc &iInfo);
-
-  AMDGPULibFunc();
-  AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom);
+  AMDGPULibFuncImpl() {}
+  virtual ~AMDGPULibFuncImpl() {}
 
-  ENamePrefix   getPrefix() const { return FKind; }
-  EFuncId  getId() const { return FuncId; }
+  /// Get unmangled name for mangled library function and name for unmangled
+  /// library function.
+  virtual std::string getName() const = 0;
+  virtual unsigned getNumArgs() const = 0;
+  EFuncId getId() const { return FuncId; }
+  ENamePrefix getPrefix() const { return FKind; }
 
-  std::string   getName() const;
-  unsigned      getNumArgs() const;
+  bool isMangled() const { return AMDGPULibFuncBase::isMangled(FuncId); }
 
-  FunctionType* getFunctionType(Module& M) const;
+  void setId(EFuncId id) { FuncId = id; }
+  virtual bool parseFuncName(StringRef &mangledName) = 0;
 
-  std::string   mangle() const;
+  /// \return The mangled function name for mangled library functions
+  /// and unmangled function name for unmangled library functions.
+  virtual std::string mangle() const = 0;
 
+  void setName(StringRef N) { Name = N; }
   void setPrefix(ENamePrefix pfx) { FKind = pfx; }
-  void setId(EFuncId id) { FuncId = id; }
-
-  static Function* getFunction(llvm::Module *M, const AMDGPULibFunc& fInfo);
 
-  static Function* getOrInsertFunction(llvm::Module *M,
-                                       const AMDGPULibFunc& fInfo);
+  virtual FunctionType *getFunctionType(Module &M) const = 0;
 
-  static StringRef getUnmangledName(const StringRef& mangledName);
+protected:
+  EFuncId FuncId;
+  std::string Name;
+  ENamePrefix FKind;
+};
 
-  Param         Leads[2];
+/// Wrapper class for AMDGPULibFuncImpl
+class AMDGPULibFunc : public AMDGPULibFuncBase {
+public:
+  explicit AMDGPULibFunc() : Impl(std::unique_ptr<AMDGPULibFuncImpl>()) {}
+  AMDGPULibFunc(const AMDGPULibFunc &F);
+  /// Clone a mangled library func with the Id \p Id and argument info from \p
+  /// CopyFrom.
+  explicit AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom);
+  /// Construct an unmangled library function on the fly.
+  explicit AMDGPULibFunc(StringRef FName, FunctionType *FT);
+
+  AMDGPULibFunc &operator=(const AMDGPULibFunc &F);
+
+  /// Get unmangled name for mangled library function and name for unmangled
+  /// library function.
+  std::string getName() const { return Impl->getName(); }
+  unsigned getNumArgs() const { return Impl->getNumArgs(); }
+  EFuncId getId() const { return Impl->getId(); }
+  ENamePrefix getPrefix() const { return Impl->getPrefix(); }
+  /// Get leading parameters for mangled lib functions.
+  Param *getLeads();
+  const Param *getLeads() const;
+
+  bool isMangled() const { return Impl->isMangled(); }
+  void setId(EFuncId Id) { Impl->setId(Id); }
+  bool parseFuncName(StringRef &MangledName) {
+    return Impl->parseFuncName(MangledName);
+  }
+
+  /// \return The mangled function name for mangled library functions
+  /// and unmangled function name for unmangled library functions.
+  std::string mangle() const { return Impl->mangle(); }
+
+  void setName(StringRef N) { Impl->setName(N); }
+  void setPrefix(ENamePrefix PFX) { Impl->setPrefix(PFX); }
+
+  FunctionType *getFunctionType(Module &M) const {
+    return Impl->getFunctionType(M);
+  }
+  static Function *getFunction(llvm::Module *M, const AMDGPULibFunc &fInfo);
+
+  static Function *getOrInsertFunction(llvm::Module *M,
+                                       const AMDGPULibFunc &fInfo);
+  static bool parse(StringRef MangledName, AMDGPULibFunc &Ptr);
 
 private:
-  EFuncId       FuncId;
-  ENamePrefix   FKind;
-  std::string   Name;
+  /// Initialize as a mangled library function.
+  void initMangled();
+  std::unique_ptr<AMDGPULibFuncImpl> Impl;
+};
+
+class AMDGPUMangledLibFunc : public AMDGPULibFuncImpl {
+public:
+  Param Leads[2];
+
+  explicit AMDGPUMangledLibFunc();
+  explicit AMDGPUMangledLibFunc(EFuncId id,
+                                const AMDGPUMangledLibFunc &copyFrom);
 
-  void          reset();
+  std::string getName() const override;
+  unsigned getNumArgs() const override;
+  FunctionType *getFunctionType(Module &M) const override;
+  static StringRef getUnmangledName(StringRef MangledName);
 
-  std::string   mangleNameItanium() const;
-  bool          parseItanuimName(StringRef& mangledName);
+  bool parseFuncName(StringRef &mangledName) override;
 
-  std::string   mangleName(const StringRef& name) const;
-  bool          parseName(const StringRef& mangledName);
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const AMDGPULibFuncImpl *F) { return F->isMangled(); }
 
-  template <typename Stream>
-  void          writeName(Stream& OS) const;
+  std::string mangle() const override;
+
+private:
+  std::string mangleNameItanium() const;
+
+  std::string mangleName(StringRef Name) const;
+  bool parseUnmangledName(StringRef MangledName);
+
+  template <typename Stream> void writeName(Stream &OS) const;
 };
 
+class AMDGPUUnmangledLibFunc : public AMDGPULibFuncImpl {
+  FunctionType *FuncTy;
+
+public:
+  explicit AMDGPUUnmangledLibFunc();
+  explicit AMDGPUUnmangledLibFunc(StringRef FName, FunctionType *FT) {
+    Name = FName;
+    FuncTy = FT;
+  }
+  std::string getName() const override { return Name; }
+  unsigned getNumArgs() const override;
+  FunctionType *getFunctionType(Module &M) const override { return FuncTy; }
+
+  bool parseFuncName(StringRef &Name) override;
+
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const AMDGPULibFuncImpl *F) { return !F->isMangled(); }
+
+  std::string mangle() const override { return Name; }
+
+  void setFunctionType(FunctionType *FT) { FuncTy = FT; }
+};
 }
 #endif // _AMDGPU_LIBFUNC_H_
Index: lib/Target/AMDGPU/AMDGPULibFunc.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -65,6 +65,51 @@
    unsigned getNumArgs() const;
 };
 
+// Information about library functions with unmangled names.
+class UnmangledFuncInfo {
+  StringRef const Name;
+  unsigned NumArgs;
+
+  // Table for all lib functions with unmangled names.
+  static const UnmangledFuncInfo Table[];
+
+  // Number of entries in Table.
+  static const unsigned TableSize;
+
+  // Map function name to index.
+  class NameMap : public StringMap<unsigned> {
+  public:
+    NameMap() {
+      for (unsigned I = 0; I != TableSize; ++I)
+        (*this)[Table[I].Name] = I;
+    }
+  };
+  friend class NameMap;
+  static NameMap Map;
+
+public:
+  using ID = AMDGPULibFunc::EFuncId;
+  UnmangledFuncInfo() = default;
+  UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs)
+      : Name(_Name), NumArgs(_NumArgs) {}
+  // Look up the function ID for Name; returns false if Name is not in Table.
+  static bool lookup(StringRef Name, ID &Id);
+  static unsigned toIndex(ID Id) {
+    assert(static_cast<unsigned>(Id) >
+               static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED) &&
+           "Invalid unmangled library function");
+    return static_cast<unsigned>(Id) - 1 -
+           static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED);
+  }
+  static ID toFuncId(unsigned Index) {
+    assert(Index < TableSize && "Invalid unmangled library function");
+    return static_cast<ID>(
+        Index + 1 + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED));
+  }
+  static unsigned getNumArgs(ID Id) { return Table[toIndex(Id)].NumArgs; }
+  static StringRef getName(ID Id) { return Table[toIndex(Id)].Name; }
+};
+
 unsigned ManglingRule::getNumArgs() const {
    unsigned I=0;
    while (I < (sizeof Param/sizeof Param[0]) && Param[I]) ++I;
@@ -215,7 +260,6 @@
 { "powr"                            , {1},   {E_ANY,E_COPY}},
 { "prefetch"                        , {1},   {E_CONSTPTR_ANY,EX_SIZET}},
 { "radians"                         , {1},   {E_ANY}},
-{ "read_pipe"                       , {4},   {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
 { "recip"                           , {1},   {E_ANY}},
 { "remainder"                       , {1},   {E_ANY,E_COPY}},
 { "remquo"                          , {1,3}, {E_ANY,E_COPY,E_ANY}},
@@ -283,7 +327,6 @@
 { "write_imagef"                    , {1},   {E_ANY,E_IMAGECOORDS,EX_FLOAT4}},
 { "write_imagei"                    , {1},   {E_ANY,E_IMAGECOORDS,EX_INTV4}},
 { "write_imageui"                   , {1},   {E_ANY,E_IMAGECOORDS,EX_UINTV4}},
-{ "write_pipe"                      , {4},   {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
 { "ncos"                            , {1},   {E_ANY} },
 { "nexp2"                           , {1},   {E_ANY} },
 { "nfma"                            , {1},   {E_ANY, E_COPY, E_COPY} },
@@ -298,6 +341,19 @@
 { "rcbrt"                           , {1},   {E_ANY} },
 };
 
+// Library functions with unmangled name.
+const UnmangledFuncInfo UnmangledFuncInfo::Table[] = {
+    {"__read_pipe_2", 4},
+    {"__read_pipe_4", 6},
+    {"__write_pipe_2", 4},
+    {"__write_pipe_4", 6},
+};
+
+const unsigned UnmangledFuncInfo::TableSize =
+    sizeof(UnmangledFuncInfo::Table) / sizeof(UnmangledFuncInfo::Table[0]);
+
+UnmangledFuncInfo::NameMap UnmangledFuncInfo::Map;
+
 static const struct ManglingRulesMap : public StringMap<int> {
   ManglingRulesMap()
     : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
@@ -461,18 +517,7 @@
 
 } // end anonymous namespace
 
-AMDGPULibFunc::AMDGPULibFunc() {
-  reset();
-}
-
-AMDGPULibFunc::AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom)
-  : FuncId(id) {
-  FKind = copyFrom.FKind;
-  Leads[0] = copyFrom.Leads[0];
-  Leads[1] = copyFrom.Leads[1];
-}
-
-void AMDGPULibFunc::reset() {
+AMDGPUMangledLibFunc::AMDGPUMangledLibFunc() {
   FuncId = EI_NONE;
   FKind = NOPFX;
   Leads[0].reset();
@@ -480,6 +525,19 @@
   Name.clear();
 }
 
+AMDGPUUnmangledLibFunc::AMDGPUUnmangledLibFunc() {
+  FuncId = EI_NONE;
+  FuncTy = nullptr;
+}
+
+AMDGPUMangledLibFunc::AMDGPUMangledLibFunc(
+    EFuncId id, const AMDGPUMangledLibFunc &copyFrom) {
+  FuncId = id;
+  FKind = copyFrom.FKind;
+  Leads[0] = copyFrom.Leads[0];
+  Leads[1] = copyFrom.Leads[1];
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Demangling
 
@@ -508,8 +566,8 @@
   return Pfx;
 }
 
-bool AMDGPULibFunc::parseName(const StringRef& fullName) {
-  FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(fullName));
+bool AMDGPUMangledLibFunc::parseUnmangledName(StringRef FullName) {
+  FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(FullName));
   return FuncId != EI_NONE;
 }
 
@@ -601,10 +659,11 @@
   return true;
 }
 
-bool AMDGPULibFunc::parseItanuimName(StringRef& mangledName) {
+bool AMDGPUMangledLibFunc::parseFuncName(StringRef &mangledName) {
   StringRef Name = eatLengthPrefixedName(mangledName);
   FKind = parseNamePrefix(Name);
-  if (!parseName(Name)) return false;
+  if (!parseUnmangledName(Name))
+    return false;
 
   const ManglingRule& Rule = manglingRules[FuncId];
   ItaniumParamParser Parser;
@@ -619,30 +678,42 @@
   return true;
 }
 
-bool AMDGPULibFunc::parse(StringRef mangledName, AMDGPULibFunc& iInfo) {
-  iInfo.reset();
-  if (mangledName.empty())
+bool AMDGPUUnmangledLibFunc::parseFuncName(StringRef &Name) {
+  if (!UnmangledFuncInfo::lookup(Name, FuncId))
     return false;
+  setName(Name);
+  return true;
+}
 
-  if (eatTerm(mangledName, "_Z")) {
-    return iInfo.parseItanuimName(mangledName);
+bool AMDGPULibFunc::parse(StringRef FuncName, AMDGPULibFunc &F) {
+  if (FuncName.empty()) {
+    F.Impl = std::unique_ptr<AMDGPULibFuncImpl>();
+    return false;
   }
+
+  if (eatTerm(FuncName, "_Z"))
+    F.Impl = make_unique<AMDGPUMangledLibFunc>();
+  else
+    F.Impl = make_unique<AMDGPUUnmangledLibFunc>();
+  if (F.Impl->parseFuncName(FuncName))
+    return true;
+
+  F.Impl = std::unique_ptr<AMDGPULibFuncImpl>();
   return false;
 }
 
-StringRef AMDGPULibFunc::getUnmangledName(const StringRef& mangledName) {
+StringRef AMDGPUMangledLibFunc::getUnmangledName(StringRef mangledName) {
   StringRef S = mangledName;
   if (eatTerm(S, "_Z"))
     return eatLengthPrefixedName(S);
   return StringRef();
 }
 
-
 ///////////////////////////////////////////////////////////////////////////////
 // Mangling
 
 template <typename Stream>
-void AMDGPULibFunc::writeName(Stream& OS) const {
+void AMDGPUMangledLibFunc::writeName(Stream &OS) const {
   const char *Pfx = "";
   switch (FKind) {
   case NATIVE: Pfx = "native_"; break;
@@ -658,9 +729,7 @@
   }
 }
 
-std::string AMDGPULibFunc::mangle() const {
-  return mangleNameItanium();
-}
+std::string AMDGPUMangledLibFunc::mangle() const { return mangleNameItanium(); }
 
 ///////////////////////////////////////////////////////////////////////////////
 // Itanium Mangling
@@ -788,7 +857,7 @@
 };
 } // namespace
 
-std::string AMDGPULibFunc::mangleNameItanium() const {
+std::string AMDGPUMangledLibFunc::mangleNameItanium() const {
   SmallString<128> Buf;
   raw_svector_ostream S(Buf);
   SmallString<128> NameBuf;
@@ -850,7 +919,7 @@
   return T;
 }
 
-FunctionType* AMDGPULibFunc::getFunctionType(Module& M) const {
+FunctionType *AMDGPUMangledLibFunc::getFunctionType(Module &M) const {
   LLVMContext& C = M.getContext();
   std::vector<Type*> Args;
   ParamIterator I(Leads, manglingRules[FuncId]);
@@ -863,18 +932,22 @@
     Args, false);
 }
 
-unsigned AMDGPULibFunc::getNumArgs() const {
+unsigned AMDGPUMangledLibFunc::getNumArgs() const {
   return manglingRules[FuncId].getNumArgs();
 }
 
-std::string AMDGPULibFunc::getName() const {
+unsigned AMDGPUUnmangledLibFunc::getNumArgs() const {
+  return UnmangledFuncInfo::getNumArgs(FuncId);
+}
+
+std::string AMDGPUMangledLibFunc::getName() const {
   SmallString<128> Buf;
   raw_svector_ostream OS(Buf);
   writeName(OS);
   return OS.str();
 }
 
-Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc& fInfo) {
+Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
   std::string FuncName = fInfo.mangle();
   Function *F = dyn_cast_or_null<Function>(
     M->getValueSymbolTable().lookup(FuncName));
@@ -889,7 +962,7 @@
 }
 
 Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
-                                             const AMDGPULibFunc& fInfo) {
+                                             const AMDGPULibFunc &fInfo) {
   std::string const FuncName = fInfo.mangle();
   Function *F = dyn_cast_or_null<Function>(
     M->getValueSymbolTable().lookup(FuncName));
@@ -929,3 +1002,52 @@
 
   return cast<Function>(C);
 }
+
+// Resolve a library function name to its function ID through the name map.
+// On a miss, Id is set to EI_NONE and false is returned.
+bool UnmangledFuncInfo::lookup(StringRef Name, ID &Id) {
+  const auto Entry = Map.find(Name);
+  const bool Found = Entry != Map.end();
+  // toFuncId is only evaluated for a valid table index.
+  Id = Found ? toFuncId(Entry->second) : AMDGPULibFunc::EI_NONE;
+  return Found;
+}
+
+AMDGPULibFunc::AMDGPULibFunc(const AMDGPULibFunc &F) {
+  // F.Impl is null for a default-constructed F, and dyn_cast rejects null, so
+  // use dyn_cast_or_null; for a null F.Impl, Impl keeps its default null state.
+  if (auto *MF = dyn_cast_or_null<AMDGPUMangledLibFunc>(F.Impl.get()))
+    Impl.reset(new AMDGPUMangledLibFunc(*MF));
+  else if (auto *UMF = dyn_cast_or_null<AMDGPUUnmangledLibFunc>(F.Impl.get()))
+    Impl.reset(new AMDGPUUnmangledLibFunc(*UMF));
+}
+
+AMDGPULibFunc &AMDGPULibFunc::operator=(const AMDGPULibFunc &F) {
+  // Clone F first, then move-assign so the old Impl is destroyed rather than
+  // leaked (placement new over *this would overwrite the live unique_ptr).
+  Impl = std::move(AMDGPULibFunc(F).Impl);
+  return *this;
+}
+
+AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom) {
+  assert(AMDGPULibFuncBase::isMangled(Id) && CopyFrom.isMangled() &&
+         "not supported");
+  Impl.reset(new AMDGPUMangledLibFunc(
+      Id, *cast<AMDGPUMangledLibFunc>(CopyFrom.Impl.get())));
+}
+
+AMDGPULibFunc::AMDGPULibFunc(StringRef Name, FunctionType *FT) {
+  Impl.reset(new AMDGPUUnmangledLibFunc(Name, FT));
+}
+
+void AMDGPULibFunc::initMangled() { Impl.reset(new AMDGPUMangledLibFunc()); }
+
+AMDGPULibFunc::Param *AMDGPULibFunc::getLeads() {
+  if (!Impl)
+    initMangled();
+  return cast<AMDGPUMangledLibFunc>(Impl.get())->Leads;
+}
+
+const AMDGPULibFunc::Param *AMDGPULibFunc::getLeads() const {
+  return cast<const AMDGPUMangledLibFunc>(Impl.get())->Leads;
+}
Index: test/CodeGen/AMDGPU/simplify-libcalls.ll
===================================================================
--- test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -1,6 +1,6 @@
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
+; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -instnamer <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink -instnamer <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink -instnamer <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
 ; GCN-POSTLINK: tail call fast float @_Z3sinf(
@@ -299,8 +299,8 @@
 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
-; GCN: %0 = fmul fast float %__powx21, %__powx21
-; GCN: %__powprod3 = fmul fast float %0, %__powx22
+; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
 define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
@@ -314,8 +314,8 @@
 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
-; GCN: %0 = fmul fast float %__powx21, %__powx21
-; GCN: %__powprod3 = fmul fast float %0, %__powx22
+; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
 define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
@@ -331,8 +331,8 @@
 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
-; GCN: %0 = fmul fast float %__powx21, %__powx21
-; GCN: %__powprod3 = fmul fast float %0, %__powx22
+; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
 define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
@@ -350,12 +350,12 @@
 ; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
 ; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
-; GCN-PRELINK: %0 = bitcast float %tmp to i32
-; GCN-PRELINK: %__pow_sign = and i32 %0, -2147483648
-; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
-; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
-; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
-; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
+; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
+; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648
+; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
+; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
+; GCN-PRELINK: %[[r3:.*]] = bitcast float addrspace(1)* %a to i32 addrspace(1)*
+; GCN-PRELINK: store i32 %[[r2]], i32 addrspace(1)* %[[r3]], align 4
 define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
@@ -393,12 +393,12 @@
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
 ; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: %__yeven = shl i32 %conv, 31
-; GCN-PRELINK: %0 = bitcast float %tmp to i32
-; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %0
-; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
-; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
-; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
-; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
+; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
+; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]]
+; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
+; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
+; GCN-PRELINK: %[[r3:.*]] = bitcast float addrspace(1)* %a to i32 addrspace(1)*
+; GCN-PRELINK: store i32 %[[r2]], i32 addrspace(1)* %[[r3]], align 4
 define amdgpu_kernel void @test_pown(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
@@ -692,3 +692,96 @@
 }
 
 declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)
+
+%opencl.pipe_t = type opaque
+%opencl.reserve_id_t = type opaque
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND:[0-9]+]]
+; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
+entry: ; Exercises @__read_pipe_2 and the reservation-based @__read_pipe_4 on an i32 buffer.
+  %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
+  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)* ; i8 addrspace(4)* is the pointer type the declared builtins take
+  %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 ; prelink checks expect @__read_pipe_2_4 on an i32 pointer, trailing "i32 4, i32 4" dropped (presumably size and alignment - TODO confirm)
+  %tmp3 = tail call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
+  %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 ; prelink checks expect @__read_pipe_4_4, again without the trailing i32 pair
+  tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4)
+  ret void
+}
+
+declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32)
+
+declare %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
+
+declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32)
+
+declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
+; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
+entry: ; Write-side counterpart: exercises @__write_pipe_2 and the reservation-based @__write_pipe_4 on an i32 buffer.
+  %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
+  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)* ; i8 addrspace(4)* is the pointer type the declared builtins take
+  %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 ; prelink checks expect @__write_pipe_2_4 on an i32 pointer, trailing "i32 4, i32 4" dropped (presumably size and alignment - TODO confirm)
+  %tmp3 = tail call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
+  %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 ; prelink checks expect @__write_pipe_4_4, again without the trailing i32 pair
+  tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4) #0
+  ret void
+}
+
+declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+
+declare %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
+
+declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+
+declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) local_unnamed_addr
+
+%struct.S = type { [100 x i32] }
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size
+; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i16 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}}, i64 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}}, i32 400, i32 4) #[[NOUNWIND]]
+define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
+entry: ; One @__read_pipe_2 call per trailing i32 pair 1/1 .. 128/128, plus a final 400/4 call the checks expect to stay unchanged.
+  %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8 addrspace(4)*
+  %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(4)* %tmp, i32 1, i32 1) #0 ; 1/1: checks expect @__read_pipe_2_1 on an i8 pointer
+  %tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
+  %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8 addrspace(4)*
+  %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8 addrspace(4)* %tmp3, i32 2, i32 2) #0 ; 2/2: checks expect @__read_pipe_2_2 on an i16 pointer
+  %tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
+  %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8 addrspace(4)*
+  %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8 addrspace(4)* %tmp6, i32 4, i32 4) #0 ; 4/4: checks expect @__read_pipe_2_4 on an i32 pointer
+  %tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
+  %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8 addrspace(4)*
+  %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8 addrspace(4)* %tmp9, i32 8, i32 8) #0 ; 8/8: checks expect @__read_pipe_2_8 on an i64 pointer
+  %tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
+  %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8 addrspace(4)*
+  %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8 addrspace(4)* %tmp12, i32 16, i32 16) #0 ; 16/16: checks expect @__read_pipe_2_16 on a <2 x i64> pointer
+  %tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
+  %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8 addrspace(4)*
+  %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8 addrspace(4)* %tmp15, i32 32, i32 32) #0 ; 32/32: checks expect @__read_pipe_2_32 on a <4 x i64> pointer
+  %tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
+  %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8 addrspace(4)*
+  %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8 addrspace(4)* %tmp18, i32 64, i32 64) #0 ; 64/64: checks expect @__read_pipe_2_64 on an <8 x i64> pointer
+  %tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
+  %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8 addrspace(4)*
+  %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8 addrspace(4)* %tmp21, i32 128, i32 128) #0 ; 128/128: checks expect @__read_pipe_2_128 on a <16 x i64> pointer
+  %tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
+  %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8 addrspace(4)*
+  %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8 addrspace(4)* %tmp24, i32 400, i32 4) #0 ; 400/4 (%struct.S wraps [100 x i32]): checks expect the call left as @__read_pipe_2 with all four operands
+  ret void
+}
+
+; GCN-PRELINK: attributes #[[NOUNWIND]] = { nounwind }
+attributes #0 = { nounwind }