Index: lib/Analysis/ValueTracking.cpp =================================================================== --- lib/Analysis/ValueTracking.cpp +++ lib/Analysis/ValueTracking.cpp @@ -753,6 +753,29 @@ case Intrinsic::x86_sse42_crc32_64_64: KnownZero = APInt::getHighBitsSet(64, 32); break; + // Some PTX special registers are bounded per CUDA programming guide + // (http://docs.nvidia.com/cuda/cuda-c-programming-guide/ + // index.html#compute-capabilities). + // Leveraing the bounds of these special registers can lead to more + // precise value analysis. + case Intrinsic::nvvm_read_ptx_sreg_tid_x: + case Intrinsic::nvvm_read_ptx_sreg_tid_y: + // threadIdx.x, threadIdx.y < 1024 + KnownZero = APInt::getHighBitsSet(32, 32 - 10); + break; + case Intrinsic::nvvm_read_ptx_sreg_tid_z: + // threadIdx.z < 64 + KnownZero = APInt::getHighBitsSet(32, 32 - 6); + break; + case Intrinsic::nvvm_read_ptx_sreg_ctaid_x: + // blockIdx.x < 2^31 + KnownZero = APInt::getHighBitsSet(32, 32 - 31); + break; + case Intrinsic::nvvm_read_ptx_sreg_ctaid_y: + case Intrinsic::nvvm_read_ptx_sreg_ctaid_z: + // blockIdx.y, blockIdx.z < 65536 + KnownZero = APInt::getHighBitsSet(32, 32 - 16); + break; } } break; Index: test/Transforms/InstCombine/intrinsics.ll =================================================================== --- test/Transforms/InstCombine/intrinsics.ll +++ test/Transforms/InstCombine/intrinsics.ll @@ -9,6 +9,12 @@ declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() define i8 @uaddtest1(i8 %A, i8 %B) { %x = call %overflow.result @llvm.uadd.with.overflow.i8(i8 %A, i8 %B) @@ -256,3 +262,41 @@ ; CHECK-LABEL: @cttz_select( ; CHECK: select i1 %tobool, i32 %cttz, i32 32 } + +define void @nvvm_thread_idx(i32* %output_x, i32* %output_y, i32* %output_z) { +; CHECK-LABEL: @nvvm_thread_idx( + %tid_x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %tid_y = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %tid_z = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; 0 <= threadIdx.x, threadIdx.y and threadIdx.z < 2^31. +; Therefore, add i32 threadIdx.x|y|z, 5 has no unsigned wrap. + %x = add i32 %tid_x, 5 + %y = add i32 %tid_y, 5 + %z = add i32 %tid_z, 5 +; CHECK: add nuw +; CHECK: add nuw +; CHECK: add nuw + store i32 %x, i32* %output_x + store i32 %y, i32* %output_y + store i32 %z, i32* %output_z + ret void +} + +define void @nvvm_block_idx(i32* %output_x, i32* %output_y, i32* %output_z) { +; CHECK-LABEL: @nvvm_block_idx( + %bid_x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %bid_y = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + %bid_z = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() +; 0 <= blockIdx.x, blockIdx.y and blockIdx.z < 2^31. +; Therefore, add i32 blockIdx.x|y|z, 5 has no unsigned wrap. + %x = add i32 %bid_x, 5 + %y = add i32 %bid_y, 5 + %z = add i32 %bid_z, 5 +; CHECK: add nuw +; CHECK: add nuw +; CHECK: add nuw + store i32 %x, i32* %output_x + store i32 %y, i32* %output_y + store i32 %z, i32* %output_z + ret void +}