Index: lib/Target/NVPTX/NVPTXInstrInfo.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -37,8 +37,18 @@
   const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
   const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
 
-  if (DestRC->getSize() != SrcRC->getSize())
+  if (DestRC->getSize() != SrcRC->getSize()) {
+    // If the sizes differ, we may be copying an i16 value into an i32
+    // register; widen it with a zero-extending cvt.u32.u16 instead.
+    if (DestRC == &NVPTX::Int32RegsRegClass &&
+        SrcRC == &NVPTX::Int16RegsRegClass) {
+      BuildMI(MBB, I, DL, get(NVPTX::CVT_u32_u16), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          .addImm(0);
+      return;
+    }
     report_fatal_error("Copy one register into another with a different width");
+  }
 
   unsigned Op;
   if (DestRC == &NVPTX::Int1RegsRegClass) {
Index: test/CodeGen/NVPTX/reg-copy-int.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/reg-copy-int.ll
@@ -0,0 +1,341 @@
+; RUN: llc < %s -O3 -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+@__omptgt__ControlState = common addrspace(3) global [2 x i32] zeroinitializer
+@__omptgt__CudaThreadsInParallel = common addrspace(3) global i32 0
+@__omptgt__SimdNumLanes = common addrspace(3) global i32 0
+@__omptgt__0_1045bf9_30__thread_limit = global i32 0
+@.lck. = private global [8 x i32] zeroinitializer
+@__omptgt__0_1045bf9_30__simd_info = constant i8 1
+@__omptgt__shared_data_ = common addrspace(3) global [0 x i8] zeroinitializer
+
+; CHECK-LABEL: __omptgt__0_1045bf9_30_(
+; CHECK: ld.global.nc.u8 {{.*}}[[r1:%.+]], [%r{{.+}}];
+; CHECK: cvt.u32.u16 {{.*}}%r{{.+}}, [[r1]];
+; CHECK: ld.global.nc.u8 {{.*}}[[r2:%.+]], [%r{{.+}}];
+; CHECK: cvt.u32.u16 {{.*}}%r{{.+}}, [[r2]];
+define void @__omptgt__0_1045bf9_30_(i8* noalias dereferenceable(1), i32* noalias nocapture readnone dereferenceable(4), [64 x i8]* noalias nocapture readonly dereferenceable(64)) #0 {
+entry:
+ %reduction.rec.var = alloca { i8* }, align 8
+ %sum = alloca i8, align 1
+ %tmp = alloca { i32, i32, i32, i32, i8* }, align 8
+ %last = alloca i32, align 4
+ %lb = alloca i32, align 4
+ %ub = alloca i32, align 4
+ %st = alloca i32, align 4
+ %tmp12 = alloca { i32, i32, i32, i32, i8* }, align 8
+ %tmp22 = alloca { i32, i32, i32, i32, i8* }, align 8
+ %tmp28 = alloca { i32, i32, i32, i32, i8* }, align 8
+ %tmp33 = alloca { i32, i32, i32, i32, i8* }, align 8
+ store i32 1, i32* %st, align 4
+ store i32 1, i32* %last, align 4
+ store i8 0, i8* %sum, align 1
+ store [2 x i32] zeroinitializer, [2 x i32] addrspace(3)* @__omptgt__ControlState, align 4
+ store i32 0, i32 addrspace(3)* @__omptgt__CudaThreadsInParallel, align 4
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %IsTeamMaster = icmp eq i32 %3, 0
+ br i1 %IsTeamMaster, label %.master.init., label %.switch..lr.ph
+
+.master.init.: ; preds = %entry
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ store i32 %4, i32 addrspace(3)* @__omptgt__SimdNumLanes, align 4
+ br label %.switch..lr.ph
+
+.switch..lr.ph: ; preds = %entry, %.master.init.
+ tail call void @llvm.nvvm.barrier0() + %5 = load i32, i32 addrspace(3)* @__omptgt__SimdNumLanes, align 4 + %6 = add i32 %5, -1 + %7 = and i32 %6, %3 + %_ZZ15test_fixed8_addvE3sum.addr = getelementptr { i8* }, { i8* }* %reduction.rec.var, i64 0, i32 0 + %"(void*)reductionrec17" = bitcast { i8* }* %reduction.rec.var to i8* + br label %.switch. + +.switch.: ; preds = %.switch..lr.ph, %.sync.and.next.state + %8 = phi i32 [ %7, %.switch..lr.ph ], [ %10, %.sync.and.next.state ] + %ControlStateIndex.021 = phi i32 [ 0, %.switch..lr.ph ], [ %14, %.sync.and.next.state ] + %NextState.020 = phi i32 [ 0, %.switch..lr.ph ], [ %13, %.sync.and.next.state ] + switch i32 %NextState.020, label %.sync.and.next.state [ + i32 0, label %.seq.start.check + i32 1, label %.finished.case. + i32 2, label %.par.reg.pre + i32 3, label %after.barrier.check. + i32 4, label %.seq.reg.pre + ] + +.end.target: ; preds = %.sync.and.next.state + ret void + +.seq.start.check: ; preds = %.switch. + br i1 %IsTeamMaster, label %.first.seq., label %.sync.and.next.state + +.sync.and.next.state: ; preds = %.par.reg.pre, %omp.loop.end, %.if.is.parthread.or.lane, %reduction.continue, %after.barrier.check., %.seq.reg.pre, %.switch., %.master.only.seq.region, %.master.only.next.label38, %.master.only.next.label, %.first.seq., %.seq.start.check, %.finished.case. + %9 = phi i1 [ false, %.switch. ], [ false, %.seq.reg.pre ], [ false, %.master.only.seq.region ], [ false, %after.barrier.check. ], [ false, %reduction.continue ], [ false, %.master.only.next.label38 ], [ false, %.if.is.parthread.or.lane ], [ false, %omp.loop.end ], [ false, %.master.only.next.label ], [ true, %.finished.case. ], [ false, %.first.seq. ], [ false, %.seq.start.check ], [ false, %.par.reg.pre ] + %10 = phi i32 [ %8, %.switch. ], [ %8, %.seq.reg.pre ], [ %8, %.master.only.seq.region ], [ %8, %after.barrier.check. ], [ 0, %reduction.continue ], [ 0, %.master.only.next.label38 ], [ %24, %.if.is.parthread.or.lane ], [ 0, %omp.loop.end ], [ 0, %.master.only.next.label ], [ %8, %.finished.case. ], [ %8, %.first.seq. ], [ %8, %.seq.start.check ], [ %22, %.par.reg.pre ] + call void @llvm.nvvm.barrier0() + %11 = sext i32 %ControlStateIndex.021 to i64 + %12 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %11 + %13 = load i32, i32 addrspace(3)* %12, align 4 + %14 = xor i32 %ControlStateIndex.021, 1 + br i1 %9, label %.end.target, label %.switch. + +.finished.case.: ; preds = %.switch. + br label %.sync.and.next.state + +.first.seq.: ; preds = %.seq.start.check + %15 = load i32, i32* @__omptgt__0_1045bf9_30__thread_limit, align 4 + call void @__kmpc_kernel_init(i32 %15) #3 + store i32 1, i32 addrspace(3)* @__omptgt__SimdNumLanes, align 4 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %17 = call i32 @__kmpc_kernel_prepare_parallel(i32 %16, i32 1) #3 + store i32 %17, i32 addrspace(3)* @__omptgt__CudaThreadsInParallel, align 4 + %18 = sext i32 %ControlStateIndex.021 to i64 + %19 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %18 + store i32 2, i32 addrspace(3)* %19, align 4 + br label %.sync.and.next.state + +.par.reg.pre: ; preds = %.switch. 
+ %20 = load i32, i32 addrspace(3)* @__omptgt__CudaThreadsInParallel, align 4
+ %21 = icmp slt i32 %3, %20
+ %22 = load i32, i32 addrspace(3)* @__omptgt__SimdNumLanes, align 4
+ br i1 %21, label %.if.is.parthread.or.lane, label %.sync.and.next.state
+
+.if.is.parthread.or.lane: ; preds = %.par.reg.pre
+ %23 = add i32 %22, -1
+ %24 = and i32 %23, %3
+ call void @__kmpc_kernel_parallel(i32 %22) #3
+ %25 = icmp eq i32 %24, 0
+ br i1 %25, label %.par.reg.code, label %.sync.and.next.state
+
+.par.reg.code: ; preds = %.if.is.parthread.or.lane
+ call void @llvm.lifetime.start(i64 1, i8* nonnull %sum) #3
+ store i8* %sum, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8
+ %blockid = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+ %blocksize = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ %26 = mul i32 %blocksize, %blockid
+ %gid = add i32 %26, %3
+ store i32 0, i32* %lb, align 4
+ store i32 63, i32* %ub, align 4
+ call void @__kmpc_for_static_init_4({ i32, i32, i32, i32, i8* }* nonnull %tmp, i32 %gid, i32 33, i32* nonnull %last, i32* nonnull %lb, i32* nonnull %ub, i32* nonnull %st, i32 1, i32 2) #3
+ %27 = load i32, i32* %lb, align 4
+ %28 = load i32, i32* %ub, align 4
+ %29 = icmp slt i32 %28, 63
+ %30 = select i1 %29, i32 %28, i32 63
+ store i32 %30, i32* %ub, align 4
+ %31 = icmp slt i32 %27, 64
+ br i1 %31, label %omp.lb.le.global_ub..lr.ph.lr.ph, label %omp.loop.end
+
+omp.lb.le.global_ub..lr.ph.lr.ph: ; preds = %.par.reg.code
+ %32 = load i32, i32* %st, align 4
+ br label %omp.lb.le.global_ub..lr.ph
+
+omp.lb.le.global_ub..lr.ph: ; preds = %omp.lb.le.global_ub..lr.ph.lr.ph, %omp.loop.fini
+ %33 = phi i32 [ %30, %omp.lb.le.global_ub..lr.ph.lr.ph ], [ %39, %omp.loop.fini ]
+ %.pr = phi i32 [ %27, %omp.lb.le.global_ub..lr.ph.lr.ph ], [ %36, %omp.loop.fini ]
+ %34 = icmp slt i32 %.pr, 64
+ %omp.idx.le.ub.us.31 = icmp sgt i32 %.pr, %33
+ br i1 %34, label %omp.lb.le.global_ub..us.preheader, label %omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge
+
+omp.lb.le.global_ub..us.preheader: ; preds = %omp.lb.le.global_ub..lr.ph
+ br i1 %omp.idx.le.ub.us.31, label %omp.loop.fini, label %omp.lb_ub.check_pass.us.preheader
+
+omp.lb_ub.check_pass.us.preheader: ; preds = %omp.lb.le.global_ub..us.preheader
+ %sum.promoted = load i8, i8* %sum, align 1, !tbaa !2
+ br label %omp.lb_ub.check_pass.us
+
+omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge: ; preds = %omp.lb.le.global_ub..lr.ph
+ br i1 %omp.idx.le.ub.us.31, label %omp.loop.fini, label %omp.loop.main.omp.loop.end_crit_edge
+
+omp.lb_ub.check_pass.us: ; preds = %omp.lb_ub.check_pass.us.preheader, %omp.lb_ub.check_pass.us
+ %conv6.us35 = phi i8 [ %conv6.us, %omp.lb_ub.check_pass.us ], [ %sum.promoted, %omp.lb_ub.check_pass.us.preheader ]
+ %.idx..018.us32 = phi i32 [ %.next.idx..us, %omp.lb_ub.check_pass.us ], [ %.pr, %omp.lb_ub.check_pass.us.preheader ]
+ %conv.15.us = zext i8 %conv6.us35 to i32
+ %idxprom.us = sext i32 %.idx..018.us32 to i64
+ %arrayidx.us = getelementptr inbounds [64 x i8], [64 x i8]* %2, i64 0, i64 %idxprom.us
+ %35 = load i8, i8* %arrayidx.us, align 1, !tbaa !2
+ %conv4.16.us = zext i8 %35 to i32
+ %add5.us = add nuw nsw i32 %conv4.16.us, %conv.15.us
+ %conv6.us = trunc i32 %add5.us to i8
+ %.next.idx..us = add nsw i32 %.idx..018.us32, 1
+ %omp.idx.le.ub.us = icmp slt i32 %.idx..018.us32, %33
+ br i1 %omp.idx.le.ub.us, label %omp.lb_ub.check_pass.us, label %omp.loop.fini.loopexit
+
+omp.loop.fini.loopexit: ; preds = %omp.lb_ub.check_pass.us
+ %conv6.us.lcssa = phi i8 [ %conv6.us, %omp.lb_ub.check_pass.us ]
+ store i8 %conv6.us.lcssa, i8* %sum, align 1, !tbaa !2
+ br label %omp.loop.fini
+
+omp.loop.fini: ; preds = %omp.loop.fini.loopexit, %omp.lb.le.global_ub..us.preheader, %omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge
+ %36 = add i32 %32, %.pr
+ store i32 %36, i32* %lb, align 4
+ %37 = add i32 %33, %32
+ %38 = icmp slt i32 %37, 63
+ %39 = select i1 %38, i32 %37, i32 63
+ store i32 %39, i32* %ub, align 4
+ %40 = icmp slt i32 %36, 64
+ br i1 %40, label %omp.lb.le.global_ub..lr.ph, label %omp.loop.end.loopexit
+
+omp.loop.main.omp.loop.end_crit_edge: ; preds = %omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge
+ %.pr.lcssa = phi i32 [ %.pr, %omp.lb.le.global_ub..lr.ph.omp.lb.le.global_ub..lr.ph.split_crit_edge ]
+ %41 = load i8, i8* %sum, align 1, !tbaa !2
+ %conv.15 = zext i8 %41 to i32
+ %idxprom = sext i32 %.pr.lcssa to i64
+ %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* %2, i64 0, i64 %idxprom
+ %42 = load i8, i8* %arrayidx, align 1, !tbaa !2
+ %conv4.16 = zext i8 %42 to i32
+ %add5 = add nuw nsw i32 %conv4.16, %conv.15
+ %conv6 = trunc i32 %add5 to i8
+ store i8 %conv6, i8* %sum, align 1, !tbaa !2
+ br label %omp.loop.end
+
+omp.loop.end.loopexit: ; preds = %omp.loop.fini
+ br label %omp.loop.end
+
+omp.loop.end: ; preds = %omp.loop.end.loopexit, %.par.reg.code, %omp.loop.main.omp.loop.end_crit_edge
+ call void @__kmpc_for_static_fini({ i32, i32, i32, i32, i8* }* nonnull %tmp, i32 %gid) #3
+ br i1 %IsTeamMaster, label %.master.only.next.label, label %.sync.and.next.state
+
+after.barrier.check.: ; preds = %.switch.
+ %43 = icmp eq i32 %8, 0
+ br i1 %43, label %after.barrier.codegen., label %.sync.and.next.state
+
+.master.only.next.label: ; preds = %omp.loop.end
+ %44 = sext i32 %ControlStateIndex.021 to i64
+ %45 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %44
+ store i32 3, i32 addrspace(3)* %45, align 4
+ br label %.sync.and.next.state
+
+after.barrier.codegen.: ; preds = %after.barrier.check.
+ %46 = call i32 @__gpu_block_reduce() #3
+ %47 = icmp eq i32 %46, 1
+ br i1 %47, label %gpu.block.then, label %gpu.block.end
+
+gpu.block.then: ; preds = %after.barrier.codegen.
+ %_ZZ15test_fixed8_addvE3sum.rhs.i = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8
+ %48 = load i8, i8* %_ZZ15test_fixed8_addvE3sum.rhs.i, align 1
+ %49 = call i8 @__gpu_warpBlockRedu_fixed1_add(i8 %48) #3
+ %_ZZ15test_fixed8_addvE3sum.lhs.i = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8
+ store i8 %49, i8* %_ZZ15test_fixed8_addvE3sum.lhs.i, align 1
+ br label %gpu.block.end
+
+gpu.block.end: ; preds = %gpu.block.then, %after.barrier.codegen.
+ %blockid13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+ %blocksize14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ %50 = mul i32 %blocksize14, %blockid13
+ %gid16 = add i32 %50, %3
+ %51 = call i32 @__kmpc_reduce({ i32, i32, i32, i32, i8* }* nonnull %tmp12, i32 %gid16, i32 1, i64 8, i8* %"(void*)reductionrec17", void (i8*, i8*)* nonnull @omp_reduction_op, [8 x i32]* nonnull @.lck.) #3
+ switch i32 %51, label %reduction.continue [
+ i32 1, label %reduction.case1
+ i32 2, label %reduction.case2
+ ]
+
+reduction.case1: ; preds = %gpu.block.end
+ %52 = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8, !tbaa !5
+ %53 = load i8, i8* %52, align 1, !tbaa !2
+ %conv18.14 = zext i8 %53 to i32
+ %54 = load i8, i8* %0, align 1, !tbaa !2
+ %conv19.13 = zext i8 %54 to i32
+ %add20 = add nuw nsw i32 %conv19.13, %conv18.14
+ %conv21 = trunc i32 %add20 to i8
+ store i8 %conv21, i8* %0, align 1, !tbaa !2
+ call void @__kmpc_end_reduce({ i32, i32, i32, i32, i8* }* nonnull %tmp28, i32 %gid16, [8 x i32]* nonnull @.lck.) #3
+ br label %reduction.continue
+
+reduction.case2: ; preds = %gpu.block.end
+ %_ZZ15test_fixed8_addvE3sum.rhs = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr, align 8
+ %55 = load i8, i8* %_ZZ15test_fixed8_addvE3sum.rhs, align 1
+ call void @__kmpc_atomic_fixed1_add({ i32, i32, i32, i32, i8* }* nonnull %tmp22, i32 %gid16, i8* nonnull %0, i8 %55) #3
+ call void @__kmpc_end_reduce({ i32, i32, i32, i32, i8* }* nonnull %tmp33, i32 %gid16, [8 x i32]* nonnull @.lck.) #3
+ br label %reduction.continue
+
+reduction.continue: ; preds = %reduction.case2, %reduction.case1, %gpu.block.end
+ call void @llvm.lifetime.end(i64 1, i8* nonnull %sum) #3
+ call void @__kmpc_kernel_end_parallel() #3
+ br i1 %IsTeamMaster, label %.master.only.next.label38, label %.sync.and.next.state
+
+.seq.reg.pre: ; preds = %.switch.
+ br i1 %IsTeamMaster, label %.master.only.seq.region, label %.sync.and.next.state
+
+.master.only.next.label38: ; preds = %reduction.continue
+ %56 = sext i32 %ControlStateIndex.021 to i64
+ %57 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %56
+ store i32 4, i32 addrspace(3)* %57, align 4
+ br label %.sync.and.next.state
+
+.master.only.seq.region: ; preds = %.seq.reg.pre
+ %58 = sext i32 %ControlStateIndex.021 to i64
+ %59 = getelementptr [2 x i32], [2 x i32] addrspace(3)* @__omptgt__ControlState, i64 0, i64 %58
+ store i32 1, i32 addrspace(3)* %59, align 4
+ br label %.sync.and.next.state
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+declare void @__kmpc_kernel_init(i32)
+
+declare i32 @__kmpc_kernel_prepare_parallel(i32, i32)
+
+declare void @__kmpc_kernel_parallel(i32)
+
+; Function Attrs: nounwind
+define void @omp_reduction_op(i8* nocapture readonly, i8* nocapture readonly) #0 {
+entry:
+ %_ZZ15test_fixed8_addvE3sum.addr.lhs = bitcast i8* %0 to i8**
+ %_ZZ15test_fixed8_addvE3sum.addr.rhs = bitcast i8* %1 to i8**
+ %_ZZ15test_fixed8_addvE3sum.rhs = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr.rhs, align 8
+ %2 = load i8, i8* %_ZZ15test_fixed8_addvE3sum.rhs, align 1
+ %3 = tail call i8 @__gpu_warpBlockRedu_fixed1_add(i8 %2) #3
+ %_ZZ15test_fixed8_addvE3sum.lhs = load i8*, i8** %_ZZ15test_fixed8_addvE3sum.addr.lhs, align 8
+ store i8 %3, i8* %_ZZ15test_fixed8_addvE3sum.lhs, align 1
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+declare void @__kmpc_for_static_init_4({ i32, i32, i32, i32, i8* }*, i32, i32, i32*, i32*, i32*, i32*, i32, i32)
+
+declare void @__kmpc_for_static_fini({ i32, i32, i32, i32, i8* }*, i32)
+
+declare i32 @__gpu_block_reduce()
+
+declare i32 @__kmpc_reduce({ i32, i32, i32, i32, i8* }*, i32, i32, i64, i8*, void (i8*, i8*)*, [8 x i32]*) + +declare void @__kmpc_atomic_fixed1_add({ i32, i32, i32, i32, i8* }*, i32, i8*, i8) + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #3 + +declare void @__kmpc_end_reduce({ i32, i32, i32, i32, i8* }*, i32, [8 x i32]*) + +declare i8 @__gpu_warpBlockRedu_fixed1_add(i8) + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } +attributes #3 = { nounwind } + +!nvvm.annotations = !{!0} +!llvm.ident = !{!1} + +!0 = !{void (i8*, i32*, [64 x i8]*)* @__omptgt__0_1045bf9_30_, !"kernel", i32 1} +!1 = !{!"clang version 3.8.0 (https://github.com/clang-omp/clang.git 6f81551cdbd69865d8e0630a7115807ab4be1af6) (https://github.com/clang-omp/llvm.git aff26825b555d13ccc9dc1e967f8d1437ec6b8b2)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !6, i64 0} +!6 = !{!"any pointer", !3, i64 0} +
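
Note for reviewers: the sketch below shows how I read the patched branch of NVPTXInstrInfo::copyPhysReg once the hunk above is applied. It is not part of the patch; the enclosing signature and the same-width fall-through are assumed from the 3.8-era NVPTX backend, and the comment on .addImm(0) reflects my understanding that this operand is the CVT conversion-mode immediate, with 0 selecting the plain (non-rounding, non-saturating) mode.

// Sketch only, assuming the surrounding 3.8-era
// lib/Target/NVPTX/NVPTXInstrInfo.cpp definitions; not part of this diff.
void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I, DebugLoc DL,
                                 unsigned DestReg, unsigned SrcReg,
                                 bool KillSrc) const {
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
  const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);

  if (DestRC->getSize() != SrcRC->getSize()) {
    // The only width-changing copy we expect is i16 into i32; widen it
    // with a zero-extending conversion instead of aborting.
    if (DestRC == &NVPTX::Int32RegsRegClass &&
        SrcRC == &NVPTX::Int16RegsRegClass) {
      BuildMI(MBB, I, DL, get(NVPTX::CVT_u32_u16), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0); // CvtMode immediate; 0 is the plain conversion mode.
      return;
    }
    // Any other width mismatch remains a hard error.
    report_fatal_error("Copy one register into another with a different width");
  }
  // ... unchanged: same-width copies pick a mov opcode per register class ...
}

With that branch in place, the two cvt.u32.u16 instructions matched by the CHECK lines in reg-copy-int.ll are exactly what copyPhysReg emits for the i16-to-i32 copies that previously died in report_fatal_error.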