Index: lib/Target/NVPTX/NVPTXInstrInfo.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -37,8 +37,18 @@
   const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
   const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
 
-  if (DestRC->getSize() != SrcRC->getSize())
+  if (DestRC->getSize() != SrcRC->getSize()) {
+    // If the sizes differ, we may be copying an i16 value into an i32
+    // register; widen it with a zero-extending cvt.u32.u16 instead.
+    if (DestRC == &NVPTX::Int32RegsRegClass &&
+        SrcRC == &NVPTX::Int16RegsRegClass) {
+      BuildMI(MBB, I, DL, get(NVPTX::CVT_u32_u16), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          .addImm(0);
+      return;
+    }
     report_fatal_error("Copy one register into another with a different width");
+  }
 
   unsigned Op;
   if (DestRC == &NVPTX::Int1RegsRegClass) {
Index: test/CodeGen/NVPTX/reg-copy-int.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/reg-copy-int.ll
@@ -0,0 +1,341 @@
+; RUN: llc < %s -O3 -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+@__omptgt__ControlState = common addrspace(3) global [2 x i32] zeroinitializer
+@__omptgt__CudaThreadsInParallel = common addrspace(3) global i32 0
+@__omptgt__SimdNumLanes = common addrspace(3) global i32 0
+@__omptgt__0_1045bf9_30__thread_limit = global i32 0
+@.lck. = private global [8 x i32] zeroinitializer
+@__omptgt__0_1045bf9_30__simd_info = constant i8 1
+@__omptgt__shared_data_ = common addrspace(3) global [0 x i8] zeroinitializer
+
+; CHECK-LABEL: __omptgt__0_1045bf9_30_(
+; CHECK: ld.global.nc.u8 {{.*}}[[r1:%.+]], [%r{{.+}}];
+; CHECK: cvt.u32.u16 {{.*}}%r{{.+}}, [[r1]];
+; CHECK: ld.global.nc.u8 {{.*}}[[r2:%.+]], [%r{{.+}}];
+; CHECK: cvt.u32.u16 {{.*}}%r{{.+}}, [[r2]];
+define void @__omptgt__0_1045bf9_30_(i8* noalias dereferenceable(1), i32* noalias nocapture readnone dereferenceable(4), [64 x i8]* noalias nocapture readonly dereferenceable(64)) #0 {
+entry:
+ %reduction.rec.var = alloca { i8* }, align 8
+ %sum = alloca i8, align 1
+ %tmp = alloca { i32, i32, i32, i32, i8* }, align 8
+ %last = alloca i32, align 4
+ %lb = alloca i32, align 4
+ %ub = alloca i32, align 4
+ %st = alloca i32, align 4
+ %tmp12 = alloca { i32, i32, i32, i32, i8* }, align 8
+ %tmp22 = alloca { i32, i32, i32, i32, i8* }, align 8
+ %tmp28 = alloca { i32, i32, i32, i32, i8* }, align 8
+ %tmp33 = alloca { i32, i32, i32, i32, i8* }, align 8
+ store i32 1, i32* %st, align 4
+ store i32 1, i32* %last, align 4
+ store i8 0, i8* %sum, align 1
+ store [2 x i32] zeroinitializer, [2 x i32] addrspace(3)* @__omptgt__ControlState, align 4
+ store i32 0, i32 addrspace(3)* @__omptgt__CudaThreadsInParallel, align 4
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %IsTeamMaster = icmp eq i32 %3, 0
+ br i1 %IsTeamMaster, label %.master.init., label %.switch..lr.ph
+
+.master.init.: ; preds = %entry
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ store i32 %4, i32 addrspace(3)* @__omptgt__SimdNumLanes, align 4
+ br label %.switch..lr.ph
+
+.switch..lr.ph: ; preds = %entry, %.master.init.
+ tail call void @llvm.nvvm.barrier0() + %5 = load i32, i32 addrspace(3)* @__omptgt__SimdNumLanes, align 4 + %6 = add i32 %5, -1 + %7 = and i32 %6, %3 + %_ZZ15test_fixed8_addvE3sum.addr = getelementptr { i8* }, { i8* }* %reduction.rec.var, i64 0, i32 0 + %"(void*)reductionrec17" = bitcast { i8* }* %reduction.rec.var to i8* + br label %.switch. + +.switch.: ; preds = %.switch..lr.ph, %.sync.and.next.state + %8 = phi i32 [ %7, %.switch..lr.ph ], [ %10, %.sync.and.next.state ] + %ControlStateIndex.021 = phi i32 [ 0, %.switch..lr.ph ], [ %14, %.sync.and.next.state ] + %NextState.020 = phi i32 [ 0, %.switch..lr.ph ], [ %13, %.sync.and.next.state ] + switch i32 %NextState.020, label %.sync.and.next.state [ + i32 0, label %.seq.start.check + i32 1, label %.finished.case. + i32 2, label %.par.reg.pre + i32 3, label %after.barrier.check. + i32 4, label %.seq.reg.pre + ] + +.end.target: ; preds = %.sync.and.next.state + ret void + +.seq.start.check: ; preds = %.switch. + br i1 %IsTeamMaster, label %.first.seq., label %.sync.and.next.state + +.sync.and.next.state: ; preds = %.par.reg.pre, %omp.loop.end, %.if.is.parthread.or.lane, %reduction.continue, %after.barrier.check., %.seq.reg.pre, %.switch., %.master.only.seq.region, %.master.only.next.label38, %.master.only.next.label, %.first.seq., %.seq.start.check, %.finished.case. + %9 = phi i1 [ false, %.switch. ], [ false, %.seq.reg.pre ], [ false, %.master.only.seq.region ], [ false, %after.barrier.check. ], [ false, %reduction.continue ], [ false, %.master.only.next.label38 ], [ false, %.if.is.parthread.or.lane ], [ false, %omp.loop.end ], [ false, %.master.only.next.label ], [ true, %.finished.case. ], [ false, %.first.seq. ], [ false, %.seq.start.check ], [ false, %.par.reg.pre ] + %10 = phi i32 [ %8, %.switch. ], [ %8, %.seq.reg.pre ], [ %8, %.master.only.seq.region ], [ %8, %after.barrier.check. ], [ 0, %reduction.continue ], [ 0, %.master.only.next.label38 ], [ %24, %.if.is.parthread.or.lane ], [ 0, %omp.loop.end ], [ 0, %.master.only.next.label ], [ %8, %.finished.case. ], [ %8, %.first.seq. ], [ %8, %.seq.start.check ], [ %22, %.par.reg.pre ] + call void @llvm.nvvm.barrier0() + %11 = sext i32 %ControlStateIndex.021 to i64 + %12 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %11 + %13 = load i32, i32 addrspace(3)* %12, align 4 + %14 = xor i32 %ControlStateIndex.021, 1 + br i1 %9, label %.end.target, label %.switch. + +.finished.case.: ; preds = %.switch. + br label %.sync.and.next.state + +.first.seq.: ; preds = %.seq.start.check + %15 = load i32, i32* @__omptgt__0_1045bf9_30__thread_limit, align 4 + call void @__kmpc_kernel_init(i32 %15) #3 + store i32 1, i32 addrspace(3)* @__omptgt__SimdNumLanes, align 4 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %17 = call i32 @__kmpc_kernel_prepare_parallel(i32 %16, i32 1) #3 + store i32 %17, i32 addrspace(3)* @__omptgt__CudaThreadsInParallel, align 4 + %18 = sext i32 %ControlStateIndex.021 to i64 + %19 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %18 + store i32 2, i32 addrspace(3)* %19, align 4 + br label %.sync.and.next.state + +.par.reg.pre: ; preds = %.switch. 
+ %20 = load i32, i32 addrspace(3)* @__omptgt__CudaThreadsInParallel, align 4
+ %21 = icmp slt i32 %3, %20
+ %22 = load i32, i32 addrspace(3)* @__omptgt__SimdNumLanes, align 4
+ br i1 %21, label %.if.is.parthread.or.lane, label %.sync.and.next.state
+
+.if.is.parthread.or.lane: ; preds = %.par.reg.pre
+ %23 = add i32 %22, -1
+ %24 = and i32 %23, %3
+ call void @__kmpc_kernel_parallel(i32 %22) #3
+ %25 = icmp eq i32 %24, 0
+ br i1 %25, label %.par.reg.code, label %.sync.and.next.state
+
+.par.reg.code: ; preds = %.if.is.parthread.or.lane
+ call void @llvm.lifetime.start(i64 1, i8* nonnull %sum) #3
+ store i8* %sum, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8
+ %blockid = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+ %blocksize = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ %26 = mul i32 %blocksize, %blockid
+ %gid = add i32 %26, %3
+ store i32 0, i32* %lb, align 4
+ store i32 63, i32* %ub, align 4
+ call void @__kmpc_for_static_init_4({ i32, i32, i32, i32, i8* }* nonnull %tmp, i32 %gid, i32 33, i32* nonnull %last, i32* nonnull %lb, i32* nonnull %ub, i32* nonnull %st, i32 1, i32 2) #3
+ %27 = load i32, i32* %lb, align 4
+ %28 = load i32, i32* %ub, align 4
+ %29 = icmp slt i32 %28, 63
+ %30 = select i1 %29, i32 %28, i32 63
+ store i32 %30, i32* %ub, align 4
+ %31 = icmp slt i32 %27, 64
+ br i1 %31, label %omp.lb.le.global_ub..lr.ph.lr.ph, label %omp.loop.end
+
+omp.lb.le.global_ub..lr.ph.lr.ph: ; preds = %.par.reg.code
+ %32 = load i32, i32* %st, align 4
+ br label %omp.lb.le.global_ub..lr.ph
+
+omp.lb.le.global_ub..lr.ph: ; preds = %omp.lb.le.global_ub..lr.ph.lr.ph, %omp.loop.fini
+ %33 = phi i32 [ %30, %omp.lb.le.global_ub..lr.ph.lr.ph ], [ %39, %omp.loop.fini ]
+ %.pr = phi i32 [ %27, %omp.lb.le.global_ub..lr.ph.lr.ph ], [ %36, %omp.loop.fini ]
+ %34 = icmp slt i32 %.pr, 64
+ %omp.idx.le.ub.us.31 = icmp sgt i32 %.pr, %33
+ br i1 %34, label %omp.lb.le.global_ub..us.preheader, label %omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge
+
+omp.lb.le.global_ub..us.preheader: ; preds = %omp.lb.le.global_ub..lr.ph
+ br i1 %omp.idx.le.ub.us.31, label %omp.loop.fini, label %omp.lb_ub.check_pass.us.preheader
+
+omp.lb_ub.check_pass.us.preheader: ; preds = %omp.lb.le.global_ub..us.preheader
+ %sum.promoted = load i8, i8* %sum, align 1, !tbaa !2
+ br label %omp.lb_ub.check_pass.us
+
+omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge: ; preds = %omp.lb.le.global_ub..lr.ph
+ br i1 %omp.idx.le.ub.us.31, label %omp.loop.fini, label %omp.loop.main.omp.loop.end_crit_edge
+
+omp.lb_ub.check_pass.us: ; preds = %omp.lb_ub.check_pass.us.preheader, %omp.lb_ub.check_pass.us
+ %conv6.us35 = phi i8 [ %conv6.us, %omp.lb_ub.check_pass.us ], [ %sum.promoted, %omp.lb_ub.check_pass.us.preheader ]
+ %.idx..018.us32 = phi i32 [ %.next.idx..us, %omp.lb_ub.check_pass.us ], [ %.pr, %omp.lb_ub.check_pass.us.preheader ]
+ %conv.15.us = zext i8 %conv6.us35 to i32
+ %idxprom.us = sext i32 %.idx..018.us32 to i64
+ %arrayidx.us = getelementptr inbounds [64 x i8], [64 x i8]* %2, i64 0, i64 %idxprom.us
+ %35 = load i8, i8* %arrayidx.us, align 1, !tbaa !2
+ %conv4.16.us = zext i8 %35 to i32
+ %add5.us = add nuw nsw i32 %conv4.16.us, %conv.15.us
+ %conv6.us = trunc i32 %add5.us to i8
+ %.next.idx..us = add nsw i32 %.idx..018.us32, 1
+ %omp.idx.le.ub.us = icmp slt i32 %.idx..018.us32, %33
+ br i1 %omp.idx.le.ub.us, label %omp.lb_ub.check_pass.us, label %omp.loop.fini.loopexit
+
+omp.loop.fini.loopexit: ; preds = %omp.lb_ub.check_pass.us
+ %conv6.us.lcssa = phi i8 [ %conv6.us, %omp.lb_ub.check_pass.us ]
+ store i8 %conv6.us.lcssa, i8* %sum, align 1, !tbaa !2
+ br label %omp.loop.fini
+
+omp.loop.fini: ; preds = %omp.loop.fini.loopexit, %omp.lb.le.global_ub..us.preheader, %omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge
+ %36 = add i32 %32, %.pr
+ store i32 %36, i32* %lb, align 4
+ %37 = add i32 %33, %32
+ %38 = icmp slt i32 %37, 63
+ %39 = select i1 %38, i32 %37, i32 63
+ store i32 %39, i32* %ub, align 4
+ %40 = icmp slt i32 %36, 64
+ br i1 %40, label %omp.lb.le.global_ub..lr.ph, label %omp.loop.end.loopexit
+
+omp.loop.main.omp.loop.end_crit_edge: ; preds = %omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge
+ %.pr.lcssa = phi i32 [ %.pr, %omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge ]
+ %41 = load i8, i8* %sum, align 1, !tbaa !2
+ %conv.15 = zext i8 %41 to i32
+ %idxprom = sext i32 %.pr.lcssa to i64
+ %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* %2, i64 0, i64 %idxprom
+ %42 = load i8, i8* %arrayidx, align 1, !tbaa !2
+ %conv4.16 = zext i8 %42 to i32
+ %add5 = add nuw nsw i32 %conv4.16, %conv.15
+ %conv6 = trunc i32 %add5 to i8
+ store i8 %conv6, i8* %sum, align 1, !tbaa !2
+ br label %omp.loop.end
+
+omp.loop.end.loopexit: ; preds = %omp.loop.fini
+ br label %omp.loop.end
+
+omp.loop.end: ; preds = %omp.loop.end.loopexit, %.par.reg.code, %omp.loop.main.omp.loop.end_crit_edge
+ call void @__kmpc_for_static_fini({ i32, i32, i32, i32, i8* }* nonnull %tmp, i32 %gid) #3
+ br i1 %IsTeamMaster, label %.master.only.next.label, label %.sync.and.next.state
+
+after.barrier.check.: ; preds = %.switch.
+ %43 = icmp eq i32 %8, 0
+ br i1 %43, label %after.barrier.codegen., label %.sync.and.next.state
+
+.master.only.next.label: ; preds = %omp.loop.end
+ %44 = sext i32 %ControlStateIndex.021 to i64
+ %45 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %44
+ store i32 3, i32 addrspace(3)* %45, align 4
+ br label %.sync.and.next.state
+
+after.barrier.codegen.: ; preds = %after.barrier.check.
+ %46 = call i32 @__gpu_block_reduce() #3
+ %47 = icmp eq i32 %46, 1
+ br i1 %47, label %gpu.block.then, label %gpu.block.end
+
+gpu.block.then: ; preds = %after.barrier.codegen.
+ %_ZZ15test_fixed8_addvE3sum.rhs.i = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8
+ %48 = load i8, i8* %_ZZ15test_fixed8_addvE3sum.rhs.i, align 1
+ %49 = call i8 @__gpu_warpBlockRedu_fixed1_add(i8 %48) #3
+ %_ZZ15test_fixed8_addvE3sum.lhs.i = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8
+ store i8 %49, i8* %_ZZ15test_fixed8_addvE3sum.lhs.i, align 1
+ br label %gpu.block.end
+
+gpu.block.end: ; preds = %gpu.block.then, %after.barrier.codegen.
+ %blockid13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+ %blocksize14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ %50 = mul i32 %blocksize14, %blockid13
+ %gid16 = add i32 %50, %3
+ %51 = call i32 @__kmpc_reduce({ i32, i32, i32, i32, i8* }* nonnull %tmp12, i32 %gid16, i32 1, i64 8, i8* %"(void*)reductionrec17", void (i8*, i8*)* nonnull @omp_reduction_op, [8 x i32]* nonnull @.lck.) #3
+ switch i32 %51, label %reduction.continue [
+ i32 1, label %reduction.case1
+ i32 2, label %reduction.case2
+ ]
+
+reduction.case1: ; preds = %gpu.block.end
+ %52 = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8, !tbaa !5
+ %53 = load i8, i8* %52, align 1, !tbaa !2
+ %conv18.14 = zext i8 %53 to i32
+ %54 = load i8, i8* %0, align 1, !tbaa !2
+ %conv19.13 = zext i8 %54 to i32
+ %add20 = add nuw nsw i32 %conv19.13, %conv18.14
+ %conv21 = trunc i32 %add20 to i8
+ store i8 %conv21, i8* %0, align 1, !tbaa !2
+ call void @__kmpc_end_reduce({ i32, i32, i32, i32, i8* }* nonnull %tmp28, i32 %gid16, [8 x i32]* nonnull @.lck.) #3
+ br label %reduction.continue
+
+reduction.case2: ; preds = %gpu.block.end
+ %_ZZ15test_fixed8_addvE3sum.rhs = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8
+ %55 = load i8, i8* %_ZZ15test_fixed8_addvE3sum.rhs, align 1
+ call void @__kmpc_atomic_fixed1_add({ i32, i32, i32, i32, i8* }* nonnull %tmp22, i32 %gid16, i8* nonnull %0, i8 %55) #3
+ call void @__kmpc_end_reduce({ i32, i32, i32, i32, i8* }* nonnull %tmp33, i32 %gid16, [8 x i32]* nonnull @.lck.) #3
+ br label %reduction.continue
+
+reduction.continue: ; preds = %reduction.case2, %reduction.case1, %gpu.block.end
+ call void @llvm.lifetime.end(i64 1, i8* nonnull %sum) #3
+ call void @__kmpc_kernel_end_parallel() #3
+ br i1 %IsTeamMaster, label %.master.only.next.label38, label %.sync.and.next.state
+
+.seq.reg.pre: ; preds = %.switch.
+ br i1 %IsTeamMaster, label %.master.only.seq.region, label %.sync.and.next.state
+
+.master.only.next.label38: ; preds = %reduction.continue
+ %56 = sext i32 %ControlStateIndex.021 to i64
+ %57 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %56
+ store i32 4, i32 addrspace(3)* %57, align 4
+ br label %.sync.and.next.state
+
+.master.only.seq.region: ; preds = %.seq.reg.pre
+ %58 = sext i32 %ControlStateIndex.021 to i64
+ %59 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %58
+ store i32 1, i32 addrspace(3)* %59, align 4
+ br label %.sync.and.next.state
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+declare void @__kmpc_kernel_init(i32)
+
+declare i32 @__kmpc_kernel_prepare_parallel(i32, i32)
+
+declare void @__kmpc_kernel_parallel(i32)
+
+; Function Attrs: nounwind
+define void @omp_reduction_op(i8* nocapture readonly, i8* nocapture readonly) #0 {
+entry:
+ %_ZZ15test_fixed8_addvE3sum.addr.lhs = bitcast i8* %0 to i8**
+ %_ZZ15test_fixed8_addvE3sum.addr.rhs = bitcast i8* %1 to i8**
+ %_ZZ15test_fixed8_addvE3sum.rhs = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr.rhs, align 8
+ %2 = load i8, i8* %_ZZ15test_fixed8_addvE3sum.rhs, align 1
+ %3 = tail call i8 @__gpu_warpBlockRedu_fixed1_add(i8 %2) #3
+ %_ZZ15test_fixed8_addvE3sum.lhs = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr.lhs, align 8
+ store i8 %3, i8* %_ZZ15test_fixed8_addvE3sum.lhs, align 1
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+declare void @__kmpc_for_static_init_4({ i32, i32, i32, i32, i8* }*, i32, i32, i32*, i32*, i32*, i32*, i32, i32)
+
+declare void @__kmpc_for_static_fini({ i32, i32, i32, i32, i8* }*, i32)
+
+declare i32 @__gpu_block_reduce()
+
+declare i32 @__kmpc_reduce({ i32, i32, i32, i32, i8* }*, i32, i32, i64, i8*, void (i8*, i8*)*, [8 x i32]*) + +declare void @__kmpc_atomic_fixed1_add({ i32, i32, i32, i32, i8* }*, i32, i8*, i8) + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #3 + +declare void @__kmpc_end_reduce({ i32, i32, i32, i32, i8* }*, i32, [8 x i32]*) + +declare i8 @__gpu_warpBlockRedu_fixed1_add(i8) + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } +attributes #3 = { nounwind } + +!nvvm.annotations = !{!0} +!llvm.ident = !{!1} + +!0 = !{void (i8*, i32*, [64 x i8]*)* @__omptgt__0_1045bf9_30_, !"kernel", i32 1} +!1 = !{!"clang version 3.8.0 (https://github.com/clang-omp/clang.git 6f81551cdbd69865d8e0630a7115807ab4be1af6) (https://github.com/clang-omp/llvm.git aff26825b555d13ccc9dc1e967f8d1437ec6b8b2)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !6, i64 0} +!6 = !{!"any pointer", !3, i64 0} +
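
Note for reviewers: the sketch below shows how I read the patched branch of NVPTXInstrInfo::copyPhysReg once the hunk above is applied. It is not part of the patch; the enclosing signature and the same-width fall-through are assumed from the 3.8-era NVPTX backend, and the comment on .addImm(0) reflects my understanding that this operand is the CVT conversion-mode immediate, with 0 selecting the plain (non-rounding, non-saturating) mode.

// Sketch only, assuming the surrounding 3.8-era
// lib/Target/NVPTX/NVPTXInstrInfo.cpp definitions; not part of this diff.
void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I, DebugLoc DL,
                                 unsigned DestReg, unsigned SrcReg,
                                 bool KillSrc) const {
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
  const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);

  if (DestRC->getSize() != SrcRC->getSize()) {
    // The only width-changing copy we expect is i16 into i32; widen it
    // with a zero-extending conversion instead of aborting.
    if (DestRC == &NVPTX::Int32RegsRegClass &&
        SrcRC == &NVPTX::Int16RegsRegClass) {
      BuildMI(MBB, I, DL, get(NVPTX::CVT_u32_u16), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0); // CvtMode immediate; 0 is the plain conversion mode.
      return;
    }
    // Any other width mismatch remains a hard error.
    report_fatal_error("Copy one register into another with a different width");
  }
  // ... unchanged: same-width copies pick a mov opcode per register class ...
}

With that branch in place, the two cvt.u32.u16 instructions matched by the CHECK lines in reg-copy-int.ll are exactly what copyPhysReg emits for the i16-to-i32 copies that previously died in report_fatal_error.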