Skip to content

Commit d17de53

Browse files
author
Justin Lebar
committedJan 18, 2017
[NVPTX] Improve lowering of llvm.ctlz.
Summary: * Mark ctlz as cheap to speculate. This disables the default transformation that inserts a branch on every ctlz(x) which has defined behavior on x == 0 to check whether x is, in fact, zero. * Add DAG patterns that avoid re-truncating or re-expanding the result of the 16- and 64-bit clz instructions. Reviewers: tra Subscribers: llvm-commits, jholewinski Differential Revision: https://reviews.llvm.org/D28719 llvm-svn: 292299
1 parent 3313905 commit d17de53

File tree

3 files changed

+135
-27
lines changed

3 files changed

+135
-27
lines changed
 

‎llvm/lib/Target/NVPTX/NVPTXISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,12 @@ class NVPTXTargetLowering : public TargetLowering {
517517

518518
bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
519519

520+
// The default is to transform llvm.ctlz(x, false) (where false indicates that
521+
// x == 0 is not undefined behavior) into a branch that checks whether x is 0
522+
// and avoids calling ctlz in that case. We have a dedicated ctlz
523+
// instruction, so we say that ctlz is cheap to speculate.
524+
bool isCheapToSpeculateCtlz() const override { return true; }
525+
520526
private:
521527
const NVPTXSubtarget &STI; // cache the subtarget here
522528
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;

‎llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2774,18 +2774,32 @@ let hasSideEffects = 0 in {
27742774
// 32-bit has a direct PTX instruction
27752775
def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
27762776

2777-
// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
2778-
// to 64-bit to match the LLVM semantics
2777+
// The return type of the ctlz ISD node is the same as its input, but the PTX
2778+
// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
2779+
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
2780+
// truncating back down to 32 bits.
27792781
def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
2782+
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
27802783

2781-
// For 16-bit, we zero-extend to 32-bit, then trunc the result back
2782-
// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
2783-
// than 16 bits to store). We also need to subtract 16 because the
2784-
// high-order 16 zeros were counted.
2784+
// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
2785+
// result back to 16-bits if necessary. We also need to subtract 16 because
2786+
// the high-order 16 zeros were counted.
2787+
//
2788+
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
2789+
// use to save one SASS instruction (on sm_35 anyway):
2790+
//
2791+
// mov.b32 $tmp, {0xffff, $a}
2792+
// clz.b32 $result, $tmp
2793+
//
2794+
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
2795+
// and then ctlz that value. This way we don't have to subtract 16 from the
2796+
// result. Unfortunately today we don't have a way to generate
2797+
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
27852798
def : Pat<(ctlz Int16Regs:$a),
2786-
(SUBi16ri (CVT_u16_u32 (CLZr32
2787-
(CVT_u32_u16 Int16Regs:$a, CvtNONE)),
2788-
CvtNONE), 16)>;
2799+
(SUBi16ri (CVT_u16_u32
2800+
(CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
2801+
def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
2802+
(SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
27892803

27902804
// Population count
27912805
let hasSideEffects = 0 in {

‎llvm/test/CodeGen/NVPTX/ctlz.ll

Lines changed: 106 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,39 +6,127 @@ declare i16 @llvm.ctlz.i16(i16, i1) readnone
66
declare i32 @llvm.ctlz.i32(i32, i1) readnone
77
declare i64 @llvm.ctlz.i64(i64, i1) readnone
88

9+
; There should be no difference between llvm.ctlz.i32(%a, true) and
10+
; llvm.ctlz.i32(%a, false), as ptx's clz(0) is well-defined (it returns 32).
11+
12+
; CHECK-LABEL: myctpop(
913
define i32 @myctpop(i32 %a) {
10-
; CHECK: clz.b32
14+
; CHECK: ld.param.
15+
; CHECK-NEXT: clz.b32
16+
; CHECK-NEXT: st.param.
17+
; CHECK-NEXT: ret;
1118
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
1219
ret i32 %val
1320
}
14-
15-
define i16 @myctpop16(i16 %a) {
16-
; CHECK: clz.b32
17-
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
18-
ret i16 %val
21+
; CHECK-LABEL: myctpop_2(
22+
define i32 @myctpop_2(i32 %a) {
23+
; CHECK: ld.param.
24+
; CHECK-NEXT: clz.b32
25+
; CHECK-NEXT: st.param.
26+
; CHECK-NEXT: ret;
27+
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
28+
ret i32 %val
1929
}
2030

31+
; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
32+
; value, so here we have to zero-extend it.
33+
; CHECK-LABEL: myctpop64(
2134
define i64 @myctpop64(i64 %a) {
22-
; CHECK: clz.b64
35+
; CHECK: ld.param.
36+
; CHECK-NEXT: clz.b64
37+
; CHECK-NEXT: cvt.u64.u32
38+
; CHECK-NEXT: st.param.
39+
; CHECK-NEXT: ret;
2340
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
2441
ret i64 %val
2542
}
43+
; CHECK-LABEL: myctpop64_2(
44+
define i64 @myctpop64_2(i64 %a) {
45+
; CHECK: ld.param.
46+
; CHECK-NEXT: clz.b64
47+
; CHECK-NEXT: cvt.u64.u32
48+
; CHECK-NEXT: st.param.
49+
; CHECK-NEXT: ret;
50+
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
51+
ret i64 %val
52+
}
2653

27-
28-
define i32 @myctpop_2(i32 %a) {
29-
; CHECK: clz.b32
30-
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
31-
ret i32 %val
54+
; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
55+
; natural return width of ptx's clz.b64 instruction. No conversions should be
56+
; necessary in the PTX.
57+
; CHECK-LABEL: myctpop64_as_32(
58+
define i32 @myctpop64_as_32(i64 %a) {
59+
; CHECK: ld.param.
60+
; CHECK-NEXT: clz.b64
61+
; CHECK-NEXT: st.param.
62+
; CHECK-NEXT: ret;
63+
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
64+
%trunc = trunc i64 %val to i32
65+
ret i32 %trunc
66+
}
67+
; CHECK-LABEL: myctpop64_as_32_2(
68+
define i32 @myctpop64_as_32_2(i64 %a) {
69+
; CHECK: ld.param.
70+
; CHECK-NEXT: clz.b64
71+
; CHECK-NEXT: st.param.
72+
; CHECK-NEXT: ret;
73+
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
74+
%trunc = trunc i64 %val to i32
75+
ret i32 %trunc
3276
}
3377

34-
define i16 @myctpop16_2(i16 %a) {
35-
; CHECK: clz.b32
78+
; ctlz.i16 is implemented by extending the input to i32, computing the result,
79+
; and then truncating the result back down to i16. But the NVPTX ABI
80+
; zero-extends i16 return values to i32, so the final truncation doesn't appear
81+
; in this function.
82+
; CHECK-LABEL: myctpop_ret16(
83+
define i16 @myctpop_ret16(i16 %a) {
84+
; CHECK: ld.param.
85+
; CHECK-NEXT: cvt.u32.u16
86+
; CHECK-NEXT: clz.b32
87+
; CHECK-NEXT: sub.
88+
; CHECK-NEXT: st.param.
89+
; CHECK-NEXT: ret;
90+
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
91+
ret i16 %val
92+
}
93+
; CHECK-LABEL: myctpop_ret16_2(
94+
define i16 @myctpop_ret16_2(i16 %a) {
95+
; CHECK: ld.param.
96+
; CHECK-NEXT: cvt.u32.u16
97+
; CHECK-NEXT: clz.b32
98+
; CHECK-NEXT: sub.
99+
; CHECK-NEXT: st.param.
100+
; CHECK-NEXT: ret;
36101
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
37102
ret i16 %val
38103
}
39104

40-
define i64 @myctpop64_2(i64 %a) {
41-
; CHECK: clz.b64
42-
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
43-
ret i64 %val
105+
; Here we store the result of ctlz.i16 into an i16 pointer, so the trunc should
106+
; remain.
107+
; CHECK-LABEL: myctpop_store16(
108+
define void @myctpop_store16(i16 %a, i16* %b) {
109+
; CHECK: ld.param.
110+
; CHECK-NEXT: cvt.u32.u16
111+
; CHECK-NEXT: clz.b32
112+
; CHECK-DAG: cvt.u16.u32
113+
; CHECK-DAG: sub.
114+
; CHECK: st.{{[a-z]}}16
115+
; CHECK: ret;
116+
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
117+
store i16 %val, i16* %b
118+
ret void
119+
}
120+
; CHECK-LABEL: myctpop_store16_2(
121+
define void @myctpop_store16_2(i16 %a, i16* %b) {
122+
; CHECK: ld.param.
123+
; CHECK-NEXT: cvt.u32.u16
124+
; CHECK-NEXT: clz.b32
125+
; CHECK-DAG: cvt.u16.u32
126+
; CHECK-DAG: sub.
127+
; CHECK: st.{{[a-z]}}16
128+
; CHECK: ret;
129+
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
130+
store i16 %val, i16* %b
131+
ret void
44132
}

0 commit comments

Comments
 (0)
Please sign in to comment.