diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -106,6 +106,8 @@
       EmitGeneric = AP.EmitGeneric;
     }
 
+    // Copy Num bytes from Ptr.
+    // If Bytes > Num, zero-fill up to Bytes.
     unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
       assert((curpos + Num) <= size);
       assert((curpos + Bytes) <= size);
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1748,135 +1748,63 @@
   llvm_unreachable("Not scalar type found in printScalarConstant()");
 }
 
-// These utility functions assure we get the right sequence of bytes for a given
-// type even for big-endian machines
-template <typename T> static void ConvertIntToBytes(unsigned char *p, T val) {
-  int64_t vp = (int64_t)val;
-  for (unsigned i = 0; i < sizeof(T); ++i) {
-    p[i] = (unsigned char)vp;
-    vp >>= 8;
-  }
-}
-static void ConvertFloatToBytes(unsigned char *p, float val) {
-  int32_t *vp = (int32_t *)&val;
-  for (unsigned i = 0; i < sizeof(int32_t); ++i) {
-    p[i] = (unsigned char)*vp;
-    *vp >>= 8;
-  }
-}
-static void ConvertDoubleToBytes(unsigned char *p, double val) {
-  int64_t *vp = (int64_t *)&val;
-  for (unsigned i = 0; i < sizeof(int64_t); ++i) {
-    p[i] = (unsigned char)*vp;
-    *vp >>= 8;
-  }
-}
-
 void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
-                                   AggBuffer *aggBuffer) {
+                                   AggBuffer *AggBuffer) {
   const DataLayout &DL = getDataLayout();
-
+  int AllocSize = DL.getTypeAllocSize(CPV->getType());
   if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
-    int s = DL.getTypeAllocSize(CPV->getType());
-    if (s < Bytes)
-      s = Bytes;
-    aggBuffer->addZeros(s);
+    // A non-zero Bytes means the caller wants exactly that many bytes
+    // zero-filled; otherwise only the space allocated for CPV is zero-filled.
+    AggBuffer->addZeros(Bytes ? Bytes : AllocSize);
     return;
   }
 
-  unsigned char ptr[8];
-  switch (CPV->getType()->getTypeID()) {
+  // Helper for filling AggBuffer with APInts. Widths that are not a multiple
+  // of 8 (e.g. i1) are zero-extended; extracting past the width would assert.
+  auto AddIntToBuffer = [AggBuffer, Bytes](const APInt &Val) {
+    const APInt Padded = Val.zextOrTrunc((Val.getBitWidth() + 7) / 8 * 8);
+    SmallVector<unsigned char, 16> Buf(Padded.getBitWidth() / 8);
+    for (unsigned I = 0, E = Buf.size(); I != E; ++I)
+      Buf[I] = Padded.extractBitsAsZExtValue(8, I * 8);
+    AggBuffer->addBytes(Buf.data(), Buf.size(), Bytes);
+  };
 
-  case Type::IntegerTyID: {
-    Type *ETy = CPV->getType();
-    if (ETy == Type::getInt8Ty(CPV->getContext())) {
-      unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue();
-      ConvertIntToBytes<>(ptr, c);
-      aggBuffer->addBytes(ptr, 1, Bytes);
-    } else if (ETy == Type::getInt16Ty(CPV->getContext())) {
-      short int16 = (short)cast<ConstantInt>(CPV)->getZExtValue();
-      ConvertIntToBytes<>(ptr, int16);
-      aggBuffer->addBytes(ptr, 2, Bytes);
-    } else if (ETy == Type::getInt32Ty(CPV->getContext())) {
-      if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
-        int int32 = (int)(constInt->getZExtValue());
-        ConvertIntToBytes<>(ptr, int32);
-        aggBuffer->addBytes(ptr, 4, Bytes);
+  switch (CPV->getType()->getTypeID()) {
+  case Type::IntegerTyID:
+    if (const auto CI = dyn_cast<ConstantInt>(CPV)) {
+      AddIntToBuffer(CI->getValue());
+      break;
+    }
+    if (const auto *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+      if (const auto *CI =
+              dyn_cast<ConstantInt>(ConstantFoldConstant(Cexpr, DL))) {
+        AddIntToBuffer(CI->getValue());
         break;
-      } else if (const auto *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
-        if (const auto *constInt = dyn_cast<ConstantInt>(
-                ConstantFoldConstant(Cexpr, DL))) {
-          int int32 = (int)(constInt->getZExtValue());
-          ConvertIntToBytes<>(ptr, int32);
-          aggBuffer->addBytes(ptr, 4, Bytes);
-          break;
-        }
-        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
-          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
-          aggBuffer->addSymbol(v, Cexpr->getOperand(0));
-          aggBuffer->addZeros(4);
-          break;
-        }
       }
-      llvm_unreachable("unsupported integer const type");
-    } else if (ETy == Type::getInt64Ty(CPV->getContext())) {
-      if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
-        long long int64 = (long long)(constInt->getZExtValue());
-        ConvertIntToBytes<>(ptr, int64);
-        aggBuffer->addBytes(ptr, 8, Bytes);
+      if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+        Value *V = Cexpr->getOperand(0)->stripPointerCasts();
+        AggBuffer->addSymbol(V, Cexpr->getOperand(0));
+        AggBuffer->addZeros(AllocSize);
         break;
-      } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
-        if (const auto *constInt = dyn_cast<ConstantInt>(
-                ConstantFoldConstant(Cexpr, DL))) {
-          long long int64 = (long long)(constInt->getZExtValue());
-          ConvertIntToBytes<>(ptr, int64);
-          aggBuffer->addBytes(ptr, 8, Bytes);
-          break;
-        }
-        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
-          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
-          aggBuffer->addSymbol(v, Cexpr->getOperand(0));
-          aggBuffer->addZeros(8);
-          break;
-        }
       }
-      llvm_unreachable("unsupported integer const type");
-    } else
-      llvm_unreachable("unsupported integer const type");
+    }
+    llvm_unreachable("unsupported integer const type");
     break;
-  }
+
   case Type::HalfTyID:
   case Type::FloatTyID:
-  case Type::DoubleTyID: {
-    const auto *CFP = cast<ConstantFP>(CPV);
-    Type *Ty = CFP->getType();
-    if (Ty == Type::getHalfTy(CPV->getContext())) {
-      APInt API = CFP->getValueAPF().bitcastToAPInt();
-      uint16_t float16 = API.getLoBits(16).getZExtValue();
-      ConvertIntToBytes<>(ptr, float16);
-      aggBuffer->addBytes(ptr, 2, Bytes);
-    } else if (Ty == Type::getFloatTy(CPV->getContext())) {
-      float float32 = (float) CFP->getValueAPF().convertToFloat();
-      ConvertFloatToBytes(ptr, float32);
-      aggBuffer->addBytes(ptr, 4, Bytes);
-    } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
-      double float64 = CFP->getValueAPF().convertToDouble();
-      ConvertDoubleToBytes(ptr, float64);
-      aggBuffer->addBytes(ptr, 8, Bytes);
-    } else {
-      llvm_unreachable("unsupported fp const type");
-    }
+  case Type::DoubleTyID:
+    AddIntToBuffer(cast<ConstantFP>(CPV)->getValueAPF().bitcastToAPInt());
     break;
-  }
+
   case Type::PointerTyID: {
     if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
-      aggBuffer->addSymbol(GVar, GVar);
+      AggBuffer->addSymbol(GVar, GVar);
     } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
       const Value *v = Cexpr->stripPointerCasts();
-      aggBuffer->addSymbol(v, Cexpr);
+      AggBuffer->addSymbol(v, Cexpr);
     }
-    unsigned int s = DL.getTypeAllocSize(CPV->getType());
-    aggBuffer->addZeros(s);
+    AggBuffer->addZeros(AllocSize);
     break;
   }
 
@@ -1884,12 +1812,11 @@
   case Type::FixedVectorTyID:
   case Type::StructTyID: {
     if (isa<ConstantAggregate>(CPV) || isa<ConstantDataSequential>(CPV)) {
-      int ElementSize = DL.getTypeAllocSize(CPV->getType());
-      bufferAggregateConstant(CPV, aggBuffer);
-      if (Bytes > ElementSize)
-        aggBuffer->addZeros(Bytes - ElementSize);
+      bufferAggregateConstant(CPV, AggBuffer);
+      if (Bytes > AllocSize)
+        AggBuffer->addZeros(Bytes - AllocSize);
     } else if (isa<ConstantAggregateZero>(CPV))
-      aggBuffer->addZeros(Bytes);
+      AggBuffer->addZeros(Bytes);
     else
       llvm_unreachable("Unexpected Constant type");
     break;
diff --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll
--- a/llvm/test/CodeGen/NVPTX/globals_init.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_init.ll
@@ -15,9 +15,16 @@
 ; CHECK-DAG: .b8 Gbli64[16] = {137, 103, 69, 35, 1, 239, 205, 171, 239, 205, 171, 137, 103, 69, 35, 1};
 @Gbli64 = global [2 x i64] [i64 12379813738877118345, i64 81985529216486895]
 
+; CHECK-DAG: .b8 Gbli128[32] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+@Gbli128 = global [2 x i128] [i128  1339673755198158349044581307228491536, i128 21345817372864405881847059188222722561]
+
 ; CHECK-DAG: .b8 Gblf32[8] = {192, 225, 100, 75, 0, 96, 106, 69};
 @Gblf32 = global [2 x float] [float 1.5e+7, float 3.75e+3]
 
 ; CHECK-DAG: .b8 Gblf64[16] = {116, 10, 181, 48, 134, 62, 230, 58, 106, 222, 138, 98, 204, 250, 200, 75};
 @Gblf64 = global [2 x double] [double 5.75e-25, double 12.25e+56]
 
+; Make sure we fill in alignment gaps correctly.
+; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0};
+@GblU = global {i16, i32, i8} {i16 1543, i32 33752069, i8 1}
+