Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.h
===================================================================
--- lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -93,6 +93,7 @@
   bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
 
+  unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
 };
 
 } // end namespace llvm
 
Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2062,61 +2062,33 @@
   //
   // i32,ch = load t0, t7, undef:i64
   //
-  // Since we load an i8 value, the matching logic above will have selected an
-  // LDG instruction that reads i8 and stores it in an i16 register (NVPTX does
-  // not expose 8-bit registers):
-  //
-  // i16,ch = INT_PTX_LDG_GLOBAL_i8areg64 t7, t0
-  //
-  // To get the correct type in this case, truncate back to i8 and then extend
-  // to the original load type.
-  EVT OrigType = N->getValueType(0);
-  LoadSDNode *LDSD = dyn_cast<LoadSDNode>(N);
-  if (LDSD && EltVT == MVT::i8 && OrigType.getScalarSizeInBits() >= 32) {
-    unsigned CvtOpc = 0;
-
-    switch (LDSD->getExtensionType()) {
-    default:
-      llvm_unreachable("An extension is required for i8 loads");
-      break;
-    case ISD::SEXTLOAD:
-      switch (OrigType.getSimpleVT().SimpleTy) {
-      default:
-        llvm_unreachable("Unhandled integer load type");
-        break;
-      case MVT::i32:
-        CvtOpc = NVPTX::CVT_s32_s8;
-        break;
-      case MVT::i64:
-        CvtOpc = NVPTX::CVT_s64_s8;
-        break;
-      }
-      break;
-    case ISD::EXTLOAD:
-    case ISD::ZEXTLOAD:
-      switch (OrigType.getSimpleVT().SimpleTy) {
-      default:
-        llvm_unreachable("Unhandled integer load type");
-        break;
-      case MVT::i32:
-        CvtOpc = NVPTX::CVT_u32_u8;
-        break;
-      case MVT::i64:
-        CvtOpc = NVPTX::CVT_u64_u8;
-        break;
-      }
-      break;
-    }
+  // In this case, the matching logic above will select a load for the original
+  // memory type (here, i8), so our types will not match (the node needs to
+  // return an i32). Our LDG/LDU nodes do not support the concept of
+  // sign-/zero-extension, so emulate it here by adding an explicit CVT
+  // instruction. Ptxas should clean up any redundancies here.
 
-    // For each output value, truncate to i8 (since the upper 8 bits are
-    // undefined) and then extend to the desired type.
+  EVT OrigType = N->getValueType(0);
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
+
+  if (OrigType != EltVT && LdNode) {
+    // We have an extending-load. The instruction we selected operates on the
+    // smaller type, but the SDNode we are replacing has the larger type. We
+    // need to emit a CVT to make the types match.
+    bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
+    unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
+                                       EltVT.getSimpleVT(), IsSigned);
+
+    // For each output value, apply the manual sign/zero-extension and make
+    // sure all users of the load go through that CVT.
     for (unsigned i = 0; i != NumElts; ++i) {
       SDValue Res(LD, i);
       SDValue OrigVal(N, i);
 
       SDNode *CvtNode = CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
-          CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
+          CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+                                    DL, MVT::i32));
       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
     }
   }
@@ -5199,3 +5171,82 @@
   }
   return true;
 }
+
+/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
+/// conversion from \p SrcTy to \p DestTy.
+unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
+                                             bool IsSigned) {
+  unsigned CvtOpc = 0;
+
+  switch (SrcTy.SimpleTy) {
+  default:
+    llvm_unreachable("Unhandled source type");
+    break;
+  case MVT::i8:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+      break;
+    case MVT::i16:
+      CvtOpc = IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
+      break;
+    case MVT::i32:
+      CvtOpc = IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
+      break;
+    case MVT::i64:
+      CvtOpc = IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
+      break;
+    }
+    break;
+  case MVT::i16:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+      break;
+    case MVT::i8:
+      CvtOpc = IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
+      break;
+    case MVT::i32:
+      CvtOpc = IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
+      break;
+    case MVT::i64:
+      CvtOpc = IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
+      break;
+    }
+    break;
+  case MVT::i32:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+      break;
+    case MVT::i8:
+      CvtOpc = IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
+      break;
+    case MVT::i16:
+      CvtOpc = IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
+      break;
+    case MVT::i64:
+      CvtOpc = IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
+      break;
+    }
+    break;
+  case MVT::i64:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+      break;
+    case MVT::i8:
+      CvtOpc = IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
+      break;
+    case MVT::i16:
+      CvtOpc = IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
+      break;
+    case MVT::i32:
+      CvtOpc = IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
+      break;
+    }
+    break;
+  }
+
+  return CvtOpc;
+}
Index: lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXInstrInfo.td
+++ lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -377,6 +377,8 @@
 }
 
 // Generate cvts from all types to all types.
+defm CVT_s8  : CVT_FROM_ALL<"s8", Int16Regs>;
+defm CVT_u8  : CVT_FROM_ALL<"u8", Int16Regs>;
 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
Index: test/CodeGen/NVPTX/bug26185-2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/bug26185-2.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+; Verify that we correctly emit code for extending ldg/ldu. We do not expose
+; extending variants in the backend, but the ldg/ldu selection code may pick
+; extending loads as candidates. We do want to support this, so make sure we
+; emit the necessary cvt.* instructions to implement the extension and let
+; ptxas emit the real extending loads.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: spam
+define ptx_kernel void @spam(i8 addrspace(1)* noalias nocapture readonly %arg, i8 addrspace(1)* noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
+bb:
+  %tmp = bitcast i8 addrspace(1)* %arg to i16 addrspace(1)*
+  %tmp4 = bitcast i8 addrspace(1)* %arg1 to i64 addrspace(1)*
+  %tmp5 = add nsw i64 %arg3, 8
+  %tmp6 = getelementptr i16, i16 addrspace(1)* %tmp, i64 %tmp5
+; CHECK: ld.global.nc.u16
+  %tmp7 = load i16, i16 addrspace(1)* %tmp6, align 2
+; CHECK: cvt.s32.s16
+  %tmp8 = sext i16 %tmp7 to i64
+  %tmp9 = mul nsw i64 %tmp8, %tmp8
+  %tmp10 = load i64, i64 addrspace(1)* %tmp4, align 8
+  %tmp11 = add nsw i64 %tmp9, %tmp10
+  store i64 %tmp11, i64 addrspace(1)* %tmp4, align 8
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "polly.skip.fn" }
+
+!nvvm.annotations = !{!0}
+
+!0 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i64, i64)* @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}
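
Illustrative note (not part of the patch): GetConvertOpcode is a pure table
lookup from (SrcTy, DestTy, signedness) to the CVT_<dest>_<src> opcode,
mirroring PTX's cvt.<dest>.<src> mnemonics; a sign-extending i16 load consumed
as i32 yields cvt.s32.s16, while zero-/any-extending loads use the u-forms.
Below is a minimal standalone C++ sketch of the same mapping, with a
hypothetical Ty enum standing in for LLVM's MVT:

    #include <cassert>
    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for MVT::i8..i64; the real code switches on
    // MVT::SimpleValueType instead.
    enum class Ty { I8, I16, I32, I64 };

    // Returns the s8/u16-style type suffix used in PTX cvt mnemonics.
    static const char *suffix(Ty T, bool Signed) {
      switch (T) {
      case Ty::I8:  return Signed ? "s8"  : "u8";
      case Ty::I16: return Signed ? "s16" : "u16";
      case Ty::I32: return Signed ? "s32" : "u32";
      case Ty::I64: return Signed ? "s64" : "u64";
      }
      assert(false && "unhandled type");
      return "";
    }

    // Mirrors the shape of GetConvertOpcode: select the conversion from
    // SrcTy to DestTy, signed or unsigned on both sides depending on the
    // extension kind of the load being emulated.
    static std::string cvtMnemonic(Ty DestTy, Ty SrcTy, bool IsSigned) {
      assert(DestTy != SrcTy && "no conversion needed");
      return std::string("cvt.") + suffix(DestTy, IsSigned) + "." +
             suffix(SrcTy, IsSigned);
    }

    int main() {
      // SEXTLOAD i16 consumed as i32 -> "cvt.s32.s16"
      std::printf("%s\n", cvtMnemonic(Ty::I32, Ty::I16, true).c_str());
      // ZEXTLOAD/EXTLOAD i8 consumed as i64 -> "cvt.u64.u8"
      std::printf("%s\n", cvtMnemonic(Ty::I64, Ty::I8, false).c_str());
    }

On the TableGen side, the widening conversions out of i8 (CVT_s16_s8,
CVT_s32_s8, ...) already existed via the CVT_s16/CVT_s32/... multiclasses; the
new CVT_s8/CVT_u8 rows add the conversions *to* 8-bit, so every pair the
helper can return resolves to a real instruction.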