Index: mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
===================================================================
--- mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -8,6 +8,7 @@
 #ifndef MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
 #define MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
 
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
 #include <memory>
 
 namespace mlir {
@@ -24,9 +25,11 @@
 void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                          OwningRewritePatternList &patterns);
 
-/// Creates a pass that lowers GPU dialect operations to NVVM counterparts.
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-createLowerGpuOpsToNVVMOpsPass();
+/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. The
+/// index bitwidth used for the lowering of the device side index computations
+/// is configurable.
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
+    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout);
 
 } // namespace mlir
Index: mlir/include/mlir/Conversion/Passes.td
===================================================================
--- mlir/include/mlir/Conversion/Passes.td
+++ mlir/include/mlir/Conversion/Passes.td
@@ -94,6 +94,11 @@
 def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
   let summary = "Generate NVVM operations for gpu operations";
   let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
+  let options = [
+    Option<"indexBitwidth", "index-bitwidth", "unsigned",
+           /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
+           "Bitwidth of the index type, 0 to use size of machine word">
+  ];
 }
 
 //===----------------------------------------------------------------------===//
Index: mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
===================================================================
--- mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -30,7 +30,6 @@
 
 namespace {
 
-
 struct GPUShuffleOpLowering : public ConvertToLLVMPattern {
   explicit GPUShuffleOpLowering(LLVMTypeConverter &lowering_)
      : ConvertToLLVMPattern(gpu::ShuffleOp::getOperationName(),
@@ -97,17 +96,25 @@
 ///
 /// This pass only handles device code and is not meant to be run on GPU host
 /// code.
-class LowerGpuOpsToNVVMOpsPass
+struct LowerGpuOpsToNVVMOpsPass
     : public ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
-public:
+  LowerGpuOpsToNVVMOpsPass() = default;
+  LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
+    this->indexBitwidth = indexBitwidth;
+  }
+
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
 
+    /// Customize the bitwidth used for the device side index computations
+    LLVMTypeConverterCustomization customs;
+    customs.indexBitwidth = indexBitwidth;
+
     /// MemRef conversion for GPU to NVVM lowering. The GPU dialect uses memory
     /// space 5 for private memory attributions, but NVVM represents private
     /// memory allocations as local `alloca`s in the default address space. This
     /// converter drops the private memory space to support the use case above.
-    LLVMTypeConverter converter(m.getContext());
+    LLVMTypeConverter converter(m.getContext(), customs);
     converter.addConversion([&](MemRefType type) -> Optional<Type> {
       if (type.getMemorySpace() != gpu::GPUDialect::getPrivateAddressSpace())
         return llvm::None;
@@ -176,6 +183,6 @@
 }
 
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToNVVMOpsPass() {
-  return std::make_unique<LowerGpuOpsToNVVMOpsPass>();
+mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
+  return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth);
 }
Index: mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
===================================================================
--- mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1,36 +1,52 @@
 // RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s --dump-input-on-failure
+// RUN: mlir-opt %s -convert-gpu-to-nvvm='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
 
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_index_ops()
+  // CHECK32-LABEL: func @gpu_index_ops()
   func @gpu_index_ops()
       -> (index, index, index, index, index, index,
           index, index, index, index, index, index) {
+    // CHECK32-NOT: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
+
     // CHECK: = nvvm.read.ptx.sreg.tid.x : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.tid.y : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.tid.z : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.ntid.x : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.ntid.y : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.ntid.z : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.ctaid.x : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.ctaid.y : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.ctaid.z : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.nctaid.x : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.nctaid.y : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.nctaid.z : !llvm.i32
+    // CHECK: = llvm.sext %{{.*}} : !llvm.i32 to !llvm.i64
     %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
 
     std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
@@ -42,6 +58,21 @@
 
 // -----
 
+gpu.module @test_module {
+  // CHECK-LABEL: func @gpu_index_comp
+  // CHECK32-LABEL: func @gpu_index_comp
+  func @gpu_index_comp(%idx : index) -> index {
+    // CHECK: = llvm.add %{{.*}}, %{{.*}} : !llvm.i64
+    // CHECK32: = llvm.add %{{.*}}, %{{.*}} : !llvm.i32
+    %0 = addi %idx, %idx : index
+    // CHECK: llvm.return %{{.*}} : !llvm.i64
+    // CHECK32: llvm.return %{{.*}} : !llvm.i32
+    std.return %0 : index
+  }
+}
+
+// -----
+
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_all_reduce_op()
   gpu.func @gpu_all_reduce_op() {
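
Usage note: besides the -convert-gpu-to-nvvm='index-bitwidth=32' form exercised
by the new RUN line, the configurable overload can be scheduled
programmatically. A minimal sketch, assuming a top-level ModuleOp pass manager
with gpu.module ops nested inside; the helper name buildGpuToNVVMPipeline is
illustrative and not part of this patch:

  #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
  #include "mlir/Dialect/GPU/GPUDialect.h"
  #include "mlir/Pass/PassManager.h"

  // Illustrative helper: run the lowering on every gpu.module nested under
  // the top-level module, lowering device-side index computations to i32
  // instead of the width derived from the data layout.
  static void buildGpuToNVVMPipeline(mlir::PassManager &pm) {
    auto &kernelPm = pm.nest<mlir::gpu::GPUModuleOp>();
    kernelPm.addPass(
        mlir::createLowerGpuOpsToNVVMOpsPass(/*indexBitwidth=*/32));
  }

Omitting the argument keeps the previous behavior: the default
kDeriveIndexBitwidthFromDataLayout (0, mirrored by the "0" default in
Passes.td) derives the index width from the data layout, so existing callers
are unaffected.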