This is an archive of the discontinued LLVM Phabricator instance.

[ValueTracking] Consider the bounds of PTX special registers
AbandonedPublic

Authored by jingyue on Jun 15 2014, 5:17 PM.

Download Raw Diff

Details

Reviewers

jholewinski
eliben
meheff

Summary

Some PTX special registers are bounded per CUDA programming guide.
Leveraging the bounds of these special registers can lead to more precise
value analysis.

Add two new tests in test/Transforms/InstCombine/intrinsics.ll

Depends on D4144

Diff Detail

Event Timeline

jingyue updated this revision to Diff 10434.Jun 15 2014, 5:17 PM

jingyue retitled this revision from to [ValueTracking] Consider the bounds of PTX special registers.

jingyue updated this object.

jingyue edited the test plan for this revision. (Show Details)

jingyue added reviewers: eliben, jholewinski, meheff.

jingyue added a parent revision: D4144: [InstCombine] mark ADD with nuw if no unsigned overflow.

jingyue added a subscriber: Unknown Object (MLST).

LGTM

This revision is now accepted and ready to land.Jun 16 2014, 8:36 AM

meheff added inline comments.Jun 16 2014, 9:51 AM

lib/Analysis/ValueTracking.cpp
759	spelling nit: leveraging
778	These values seem to be CUDA version specific. Is there any way of guarding these? And how will we know to update these when CUDA N+1 is supported with different values?

Mark, I agree with your concern. I just found out we can use -target-cpu to pass the compute capability (e.g., sm_35) to the clang frontend. I'll send out another diff. Thanks!

I should also mention that I encountered some long compilation times which are superlinear with the unroll count when experimenting with the pragma loop limit. With the current limit (32K) on a simple loop the compilation time is ~7s. Doubling the limit results in a compilation time of ~50s. It seems to be beneath llvm::UnrollLoop -> FoldBlockIntoPredecessor -> llvm::ScalarEvolution::forgetLoop.

----- Original Message -----

From: "Mark Heffernan" <meheff@google.com>
To: jingyue@google.com, "justin holewinski" <justin.holewinski@gmail.com>, eliben@google.com, meheff@google.com
Cc: llvm-commits@cs.uiuc.edu
Sent: Monday, June 16, 2014 2:08:29 PM
Subject: Re: [PATCH] [ValueTracking] Consider the bounds of PTX special registers

I should also mention that I encountered some long compilation times
which are superlinear with the unroll count when experimenting with
the pragma loop limit. With the current limit (32K) on a simple
loop the compilation time is ~7s. Doubling the limit results in a
compilation time of ~50s. It seems to be beneath llvm::UnrollLoop
-> FoldBlockIntoPredecessor -> llvm::ScalarEvolution::forgetLoop.

Can you please file a PR to track this issue?

Thanks,
Hal

http://reviews.llvm.org/D4150

llvm-commits mailing list
llvm-commits@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

Because the ranges of PTX special registers depend on the subtarget (-target-cpu), we will have clang attach range metadata to these intrinsics and have the optimizer pick up this metadata. The second part is committed in r211281 (D4187). Will work on the first part.

jingyue mentioned this in D20644: [NVPTX] Added NVVMIntrRange pass .May 25 2016, 2:50 PM

Revision Contents

Path

Size

lib/

Analysis/

ValueTracking.cpp

23 lines

test/

Transforms/

InstCombine/

intrinsics.ll

44 lines

Diff 10434

lib/Analysis/ValueTracking.cpp

Show First 20 Lines • Show All 747 Lines • ▼ Show 20 Lines	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
case Intrinsic::ctpop: {		case Intrinsic::ctpop: {
unsigned LowBits = Log2_32(BitWidth)+1;		unsigned LowBits = Log2_32(BitWidth)+1;
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);		KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
break;		break;
}		}
case Intrinsic::x86_sse42_crc32_64_64:		case Intrinsic::x86_sse42_crc32_64_64:
KnownZero = APInt::getHighBitsSet(64, 32);		KnownZero = APInt::getHighBitsSet(64, 32);
break;		break;
		// Some PTX special registers are bounded per CUDA programming guide
		// (http://docs.nvidia.com/cuda/cuda-c-programming-guide/
		// index.html#compute-capabilities).
		// Leveraging the bounds of these special registers can lead to more
		meheffUnsubmitted Not Done Reply Inline Actions spelling nit: levaraging meheff: spelling nit: levaraging
		// precise value analysis.
		case Intrinsic::nvvm_read_ptx_sreg_tid_x:
		case Intrinsic::nvvm_read_ptx_sreg_tid_y:
		// threadIdx.x, threadIdx.y < 1024
		KnownZero = APInt::getHighBitsSet(32, 32 - 10);
		break;
		case Intrinsic::nvvm_read_ptx_sreg_tid_z:
		// threadIdx.z < 64
		KnownZero = APInt::getHighBitsSet(32, 32 - 6);
		break;
		case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
		// blockIdx.x < 2^31
		KnownZero = APInt::getHighBitsSet(32, 32 - 31);
		break;
		case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
		case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
		// blockIdx.y, blockIdx.z < 65536
		KnownZero = APInt::getHighBitsSet(32, 32 - 16);
		break;
		meheffUnsubmitted Not Done Reply Inline Actions These values seem to be CUDA version specific. Is there any way of guarding these? And how will we know to update these when CUDA N+1 is supported with different values? meheff: These values seem to be CUDA version specific. Is there any way of guarding these? And how…
}		}
}		}
break;		break;
case Instruction::ExtractValue:		case Instruction::ExtractValue:
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) {		if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) {
ExtractValueInst *EVI = cast<ExtractValueInst>(I);		ExtractValueInst *EVI = cast<ExtractValueInst>(I);
if (EVI->getNumIndices() != 1) break;		if (EVI->getNumIndices() != 1) break;
if (EVI->getIndices()[0] == 0) {		if (EVI->getIndices()[0] == 0) {
▲ Show 20 Lines • Show All 1,323 Lines • Show Last 20 Lines

test/Transforms/InstCombine/intrinsics.ll

; RUN: opt -instcombine -S < %s \| FileCheck %s		; RUN: opt -instcombine -S < %s \| FileCheck %s

%overflow.result = type {i8, i1}		%overflow.result = type {i8, i1}

declare %overflow.result @llvm.uadd.with.overflow.i8(i8, i8)		declare %overflow.result @llvm.uadd.with.overflow.i8(i8, i8)
declare %overflow.result @llvm.umul.with.overflow.i8(i8, i8)		declare %overflow.result @llvm.umul.with.overflow.i8(i8, i8)
declare double @llvm.powi.f64(double, i32) nounwind readonly		declare double @llvm.powi.f64(double, i32) nounwind readonly
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone		declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone		declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare i32 @llvm.ctpop.i32(i32) nounwind readnone		declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone		declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
		declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
		declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
		declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
		declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
		declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
		declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()

define i8 @uaddtest1(i8 %A, i8 %B) {		define i8 @uaddtest1(i8 %A, i8 %B) {
%x = call %overflow.result @llvm.uadd.with.overflow.i8(i8 %A, i8 %B)		%x = call %overflow.result @llvm.uadd.with.overflow.i8(i8 %A, i8 %B)
%y = extractvalue %overflow.result %x, 0		%y = extractvalue %overflow.result %x, 0
ret i8 %y		ret i8 %y
; CHECK-LABEL: @uaddtest1(		; CHECK-LABEL: @uaddtest1(
; CHECK-NEXT: %y = add i8 %A, %B		; CHECK-NEXT: %y = add i8 %A, %B
; CHECK-NEXT: ret i8 %y		; CHECK-NEXT: ret i8 %y
▲ Show 20 Lines • Show All 231 Lines • ▼ Show 20 Lines	define i32 @cttz_select(i32 %Value) nounwind {
%tobool = icmp ne i32 %Value, 0		%tobool = icmp ne i32 %Value, 0
%cttz = call i32 @llvm.cttz.i32(i32 %Value, i1 true)		%cttz = call i32 @llvm.cttz.i32(i32 %Value, i1 true)
%s = select i1 %tobool, i32 %cttz, i32 32		%s = select i1 %tobool, i32 %cttz, i32 32
ret i32 %s		ret i32 %s

; CHECK-LABEL: @cttz_select(		; CHECK-LABEL: @cttz_select(
; CHECK: select i1 %tobool, i32 %cttz, i32 32		; CHECK: select i1 %tobool, i32 %cttz, i32 32
}		}

		define void @nvvm_thread_idx(i32* %output_x, i32* %output_y, i32* %output_z) {
		; CHECK-LABEL: @nvvm_thread_idx(
		%tid_x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
		%tid_y = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
		%tid_z = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
		; 0 <= threadIdx.x, threadIdx.y and threadIdx.z < 2^31.
		; Therefore, add i32 threadIdx.x\|y\|z, 5 has no unsigned wrap.
		%x = add i32 %tid_x, 5
		%y = add i32 %tid_y, 5
		%z = add i32 %tid_z, 5
		; CHECK: add nuw
		; CHECK: add nuw
		; CHECK: add nuw
		store i32 %x, i32* %output_x
		store i32 %y, i32* %output_y
		store i32 %z, i32* %output_z
		ret void
		}

		define void @nvvm_block_idx(i32* %output_x, i32* %output_y, i32* %output_z) {
		; CHECK-LABEL: @nvvm_block_idx(
		%bid_x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
		%bid_y = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
		%bid_z = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
		; 0 <= blockIdx.x, blockIdx.y and blockIdx.z < 2^31.
		; Therefore, add i32 blockIdx.x\|y\|z, 5 has no unsigned wrap.
		%x = add i32 %bid_x, 5
		%y = add i32 %bid_y, 5
		%z = add i32 %bid_z, 5
		; CHECK: add nuw
		; CHECK: add nuw
		; CHECK: add nuw
		store i32 %x, i32* %output_x
		store i32 %y, i32* %output_y
		store i32 %z, i32* %output_z
		ret void
		}