Index: lib/Target/NVPTX/NVPTXAsmPrinter.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -400,7 +400,7 @@
   O << " (";
 
   if (isABI) {
-    if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) {
+    if ((Ty->isFloatingPointTy() || Ty->isIntegerTy()) && !Ty->isIntegerTy(128)) {
       unsigned size = 0;
       if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
         size = ITy->getBitWidth();
@@ -418,7 +418,7 @@
     } else if (isa<PointerType>(Ty)) {
       O << ".param .b" << TLI->getPointerTy(DL).getSizeInBits()
         << " func_retval0";
-    } else if (Ty->isAggregateType() || Ty->isVectorTy()) {
+    } else if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
       unsigned totalsz = DL.getTypeAllocSize(Ty);
       unsigned retAlignment = 0;
       if (!getAlign(*F, 0, retAlignment))
@@ -1425,6 +1425,14 @@
   else
     O << " .align " << GVar->getAlignment();
 
+  // Special case for i128: print it as a 16-byte .b8 array and return early.
+  if (ETy->isIntegerTy(128)) {
+    O << " .b8 ";
+    getSymbol(GVar)->print(O, MAI);
+    O << "[16]";
+    return;
+  }
+
   if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
     O << " .";
     O << getPTXFundamentalTypeStr(ETy);
@@ -1551,7 +1559,7 @@
     }
 
     if (!PAL.hasParamAttribute(paramIndex, Attribute::ByVal)) {
-      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
         // Just print .param .align <a> .b8 .param[size];
         // <a> = PAL.getparamalignment
         // size = typeallocsize of element type
Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -169,6 +169,19 @@
   SmallVector<EVT, 16> TempVTs;
   SmallVector<uint64_t, 16> TempOffsets;
 
+  // Special case for i128: decompose to (i64, i64) at offsets +0 and +8.
+  if (Ty->isIntegerTy(128)) {
+    ValueVTs.push_back(EVT(MVT::i64));
+    ValueVTs.push_back(EVT(MVT::i64));
+
+    if (Offsets) {
+      Offsets->push_back(StartingOffset + 0);
+      Offsets->push_back(StartingOffset + 8);
+    }
+
+    return;
+  }
+
   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
     EVT VT = TempVTs[i];
@@ -1263,7 +1276,7 @@
     O << "()";
   } else {
     O << "(";
-    if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
+    if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) && !retTy->isIntegerTy(128)) {
       unsigned size = 0;
       if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
         size = ITy->getBitWidth();
@@ -1281,7 +1294,7 @@
       O << ".param .b" << size << " _";
     } else if (isa<PointerType>(retTy)) {
       O << ".param .b" << PtrVT.getSizeInBits() << " _";
-    } else if (retTy->isAggregateType() || retTy->isVectorTy()) {
+    } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
       auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
       O << ".param .align " << retAlignment << " .b8 _["
         << DL.getTypeAllocSize(retTy) << "]";
@@ -1303,7 +1316,7 @@
     first = false;
 
     if (!Outs[OIdx].Flags.isByVal()) {
-      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
         unsigned align = 0;
         const CallInst *CallI = cast<CallInst>(CS->getInstruction());
         // +1 because index 0 is reserved for return type alignment
@@ -1459,7 +1472,7 @@
       unsigned AllocSize = DL.getTypeAllocSize(Ty);
       SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
       bool NeedAlign; // Does argument declaration specify alignment?
-      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
         // declare .param .align <align> .b8 .param<n>[<size>];
         SDValue DeclareParamOps[] = {
             Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
@@ -1635,8 +1648,8 @@
     // these three types to match the logic in
     // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
     // Plus, this behavior is consistent with nvcc's.
-    if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() ||
-        RetTy->isPointerTy()) {
+    if ((RetTy->isFloatingPointTy() || RetTy->isIntegerTy() ||
+        RetTy->isPointerTy()) && !RetTy->isIntegerTy(128)) {
       // Scalar needs to be at least 32bit wide
       if (resultsz < 32)
         resultsz = 32;
@@ -2367,7 +2380,7 @@
 
     if (theArgs[i]->use_empty()) {
       // argument is dead
-      if (Ty->isAggregateType()) {
+      if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
         SmallVector<EVT, 16> vtparts;
 
         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
Index: test/CodeGen/NVPTX/i128-param.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/i128-param.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+; CHECK-LABEL: .visible .func callee(
+; CHECK-NEXT: .param .align 8 .b8 callee_param_0[16],
+; CHECK-NEXT: .param .align 8 .b8 callee_param_1[16],
+define void @callee(i128, i128, i128*) {
+  ; CHECK-DAG: ld.param.u64 %[[REG0:rd[0-9]+]], [callee_param_0];
+  ; CHECK-DAG: ld.param.u64 %[[REG1:rd[0-9]+]], [callee_param_0+8];
+  ; CHECK-DAG: ld.param.u64 %[[REG2:rd[0-9]+]], [callee_param_1];
+  ; CHECK-DAG: ld.param.u64 %[[REG3:rd[0-9]+]], [callee_param_1+8];
+
+  ; CHECK:      mul.lo.s64 %[[REG4:rd[0-9]+]], %[[REG0]], %[[REG3]];
+  ; CHECK-NEXT: mul.hi.u64 %[[REG5:rd[0-9]+]], %[[REG0]], %[[REG2]];
+  ; CHECK-NEXT: add.s64    %[[REG6:rd[0-9]+]], %[[REG5]], %[[REG4]];
+  ; CHECK-NEXT: mul.lo.s64 %[[REG7:rd[0-9]+]], %[[REG1]], %[[REG2]];
+  ; CHECK-NEXT: add.s64    %[[REG8:rd[0-9]+]], %[[REG6]], %[[REG7]];
+  ; CHECK-NEXT: mul.lo.s64 %[[REG9:rd[0-9]+]], %[[REG0]], %[[REG2]];
+  %a = mul i128 %0, %1
+
+  store i128 %a, i128* %2
+  ret void
+}
+
+; CHECK-LABEL: .visible .entry caller_kernel(
+; CHECK-NEXT: .param .align 8 .b8 caller_kernel_param_0[16],
+; CHECK-NEXT: .param .align 8 .b8 caller_kernel_param_1[16],
+define ptx_kernel void @caller_kernel(i128, i128, i128*) {
+start:
+  ; CHECK-DAG: ld.param.u64 %[[REG0:rd[0-9]+]], [caller_kernel_param_0];
+  ; CHECK-DAG: ld.param.u64 %[[REG1:rd[0-9]+]], [caller_kernel_param_0+8];
+  ; CHECK-DAG: ld.param.u64 %[[REG2:rd[0-9]+]], [caller_kernel_param_1];
+  ; CHECK-DAG: ld.param.u64 %[[REG3:rd[0-9]+]], [caller_kernel_param_1+8];
+
+  ; CHECK:      { // callseq [[CALLSEQ_ID:[0-9]+]], 0
+  ; CHECK:      .param .align 8 .b8 param0[16];
+  ; CHECK-NEXT: st.param.b64 [param0+0], %[[REG0]];
+  ; CHECK-NEXT: st.param.b64 [param0+8], %[[REG1]];
+  ; CHECK:      .param .align 8 .b8 param1[16];
+  ; CHECK-NEXT: st.param.b64 [param1+0], %[[REG2]];
+  ; CHECK-NEXT: st.param.b64 [param1+8], %[[REG3]];
+  ; CHECK:      } // callseq [[CALLSEQ_ID]]
+  call void @callee(i128 %0, i128 %1, i128* %2)
+
+  ret void
+}
+
+; CHECK-LABEL: .visible .func caller_func(
+; CHECK-NEXT: .param .align 8 .b8 caller_func_param_0[16],
+; CHECK-NEXT: .param .align 8 .b8 caller_func_param_1[16],
+define void @caller_func(i128, i128, i128*) {
+start:
+  ; CHECK-DAG: ld.param.u64 %[[REG0:rd[0-9]+]], [caller_func_param_0];
+  ; CHECK-DAG: ld.param.u64 %[[REG1:rd[0-9]+]], [caller_func_param_0+8];
+  ; CHECK-DAG: ld.param.u64 %[[REG2:rd[0-9]+]], [caller_func_param_1];
+  ; CHECK-DAG: ld.param.u64 %[[REG3:rd[0-9]+]], [caller_func_param_1+8];
+
+  ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]+]], 0
+  ; CHECK: .param .align 8 .b8 param0[16];
+  ; CHECK: st.param.b64 [param0+0], %[[REG0]];
+  ; CHECK: st.param.b64 [param0+8], %[[REG1]];
+  ; CHECK: .param .align 8 .b8 param1[16];
+  ; CHECK: st.param.b64 [param1+0], %[[REG2]];
+  ; CHECK: st.param.b64 [param1+8], %[[REG3]];
+  ; CHECK: } // callseq [[CALLSEQ_ID]]
+  call void @callee(i128 %0, i128 %1, i128* %2)
+
+  ret void
+}
Index: test/CodeGen/NVPTX/i128-retval.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/i128-retval.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+; CHECK-LABEL: .visible .func (.param .align 8 .b8 func_retval0[16]) callee(
+define i128 @callee(i128) {
+  ; CHECK-DAG: ld.param.u64 %[[REG0:rd[0-9]+]], [callee_param_0];
+  ; CHECK-DAG: ld.param.u64 %[[REG1:rd[0-9]+]], [callee_param_0+8];
+
+  ; CHECK: st.param.b64 [func_retval0+0], %[[REG0]];
+  ; CHECK: st.param.b64 [func_retval0+8], %[[REG1]];
+  ret i128 %0
+}
+
+; CHECK-LABEL: .visible .func caller(
+define void @caller(i128, i128*) {
+start:
+  ; CHECK-DAG: ld.param.u64 %[[REG0:rd[0-9]+]], [caller_param_0];
+  ; CHECK-DAG: ld.param.u64 %[[REG1:rd[0-9]+]], [caller_param_0+8];
+  ; CHECK-DAG: ld.param.u64 %[[OUT:rd[0-9]+]],  [caller_param_1];
+
+  ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]+]], 0
+  ; CHECK: .param .align 8 .b8 retval0[16];
+  ; CHECK: call.uni (retval0),
+  ; CHECK: ld.param.b64 %[[REG2:rd[0-9]+]], [retval0+0];
+  ; CHECK: ld.param.b64 %[[REG3:rd[0-9]+]], [retval0+8];
+  ; CHECK: } // callseq [[CALLSEQ_ID]]
+  %a = call i128 @callee(i128 %0)
+
+  ; CHECK-DAG: st.u64 [%[[OUT]]], %[[REG2]];
+  ; CHECK-DAG: st.u64 [%[[OUT]]+8], %[[REG3]];
+  store i128 %a, i128* %1
+
+  ret void
+}