@@ -6,39 +6,127 @@ declare i16 @llvm.ctlz.i16(i16, i1) readnone
6
6
declare i32 @llvm.ctlz.i32 (i32 , i1 ) readnone
7
7
declare i64 @llvm.ctlz.i64 (i64 , i1 ) readnone
8
8
9
+ ; There should be no difference between llvm.ctlz.i32(%a, true) and
10
+ ; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 32.
11
+
12
+ ; CHECK-LABEL: myctpop(
9
13
define i32 @myctpop (i32 %a ) {
10
- ; CHECK: clz.b32
14
+ ; CHECK: ld.param.
15
+ ; CHECK-NEXT: clz.b32
16
+ ; CHECK-NEXT: st.param.
17
+ ; CHECK-NEXT: ret;
11
18
%val = call i32 @llvm.ctlz.i32 (i32 %a , i1 false ) readnone
12
19
ret i32 %val
13
20
}
14
-
15
- define i16 @myctpop16 (i16 %a ) {
16
- ; CHECK: clz.b32
17
- %val = call i16 @llvm.ctlz.i16 (i16 %a , i1 false ) readnone
18
- ret i16 %val
21
+ ; CHECK-LABEL: myctpop_2(
22
+ define i32 @myctpop_2 (i32 %a ) {
23
+ ; CHECK: ld.param.
24
+ ; CHECK-NEXT: clz.b32
25
+ ; CHECK-NEXT: st.param.
26
+ ; CHECK-NEXT: ret;
27
+ %val = call i32 @llvm.ctlz.i32 (i32 %a , i1 true ) readnone
28
+ ret i32 %val
19
29
}
20
30
31
+ ; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
32
+ ; value, so here we have to zero-extend it.
33
+ ; CHECK-LABEL: myctpop64(
21
34
define i64 @myctpop64 (i64 %a ) {
22
- ; CHECK: clz.b64
35
+ ; CHECK: ld.param.
36
+ ; CHECK-NEXT: clz.b64
37
+ ; CHECK-NEXT: cvt.u64.u32
38
+ ; CHECK-NEXT: st.param.
39
+ ; CHECK-NEXT: ret;
23
40
%val = call i64 @llvm.ctlz.i64 (i64 %a , i1 false ) readnone
24
41
ret i64 %val
25
42
}
43
+ ; CHECK-LABEL: myctpop64_2(
44
+ define i64 @myctpop64_2 (i64 %a ) {
45
+ ; CHECK: ld.param.
46
+ ; CHECK-NEXT: clz.b64
47
+ ; CHECK-NEXT: cvt.u64.u32
48
+ ; CHECK-NEXT: st.param.
49
+ ; CHECK-NEXT: ret;
50
+ %val = call i64 @llvm.ctlz.i64 (i64 %a , i1 true ) readnone
51
+ ret i64 %val
52
+ }
26
53
27
-
28
- define i32 @myctpop_2 (i32 %a ) {
29
- ; CHECK: clz.b32
30
- %val = call i32 @llvm.ctlz.i32 (i32 %a , i1 true ) readnone
31
- ret i32 %val
54
+ ; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
55
+ ; natural return width of ptx's clz.b64 instruction. No conversions should be
56
+ ; necessary in the PTX.
57
+ ; CHECK-LABEL: myctpop64_as_32(
58
+ define i32 @myctpop64_as_32 (i64 %a ) {
59
+ ; CHECK: ld.param.
60
+ ; CHECK-NEXT: clz.b64
61
+ ; CHECK-NEXT: st.param.
62
+ ; CHECK-NEXT: ret;
63
+ %val = call i64 @llvm.ctlz.i64 (i64 %a , i1 false ) readnone
64
+ %trunc = trunc i64 %val to i32
65
+ ret i32 %trunc
66
+ }
67
+ ; CHECK-LABEL: myctpop64_as_32_2(
68
+ define i32 @myctpop64_as_32_2 (i64 %a ) {
69
+ ; CHECK: ld.param.
70
+ ; CHECK-NEXT: clz.b64
71
+ ; CHECK-NEXT: st.param.
72
+ ; CHECK-NEXT: ret;
73
+ %val = call i64 @llvm.ctlz.i64 (i64 %a , i1 true ) readnone ; i1 true: counterpart of myctpop64_as_32, same PTX expected
74
+ %trunc = trunc i64 %val to i32
75
+ ret i32 %trunc
32
76
}
33
77
34
- define i16 @myctpop16_2 (i16 %a ) {
35
- ; CHECK: clz.b32
78
+ ; ctlz.i16 is implemented by extending the input to i32, computing the result,
79
+ ; and then truncating the result back down to i16. But the NVPTX ABI
80
+ ; zero-extends i16 return values to i32, so the final truncation doesn't appear
81
+ ; in this function.
82
+ ; CHECK-LABEL: myctpop_ret16(
83
+ define i16 @myctpop_ret16 (i16 %a ) {
84
+ ; CHECK: ld.param.
85
+ ; CHECK-NEXT: cvt.u32.u16
86
+ ; CHECK-NEXT: clz.b32
87
+ ; CHECK-NEXT: sub.
88
+ ; CHECK-NEXT: st.param.
89
+ ; CHECK-NEXT: ret;
90
+ %val = call i16 @llvm.ctlz.i16 (i16 %a , i1 false ) readnone
91
+ ret i16 %val
92
+ }
93
+ ; CHECK-LABEL: myctpop_ret16_2(
94
+ define i16 @myctpop_ret16_2 (i16 %a ) {
95
+ ; CHECK: ld.param.
96
+ ; CHECK-NEXT: cvt.u32.u16
97
+ ; CHECK-NEXT: clz.b32
98
+ ; CHECK-NEXT: sub.
99
+ ; CHECK-NEXT: st.param.
100
+ ; CHECK-NEXT: ret;
36
101
%val = call i16 @llvm.ctlz.i16 (i16 %a , i1 true ) readnone
37
102
ret i16 %val
38
103
}
39
104
40
- define i64 @myctpop64_2 (i64 %a ) {
41
- ; CHECK: clz.b64
42
- %val = call i64 @llvm.ctlz.i64 (i64 %a , i1 true ) readnone
43
- ret i64 %val
105
+ ; Here we store the result of ctlz.i16 into an i16 pointer, so the trunc should
106
+ ; remain.
107
+ ; CHECK-LABEL: myctpop_store16(
108
+ define void @myctpop_store16 (i16 %a , i16* %b ) {
109
+ ; CHECK: ld.param.
110
+ ; CHECK-NEXT: cvt.u32.u16
111
+ ; CHECK-NEXT: clz.b32
112
+ ; CHECK-DAG: cvt.u16.u32
113
+ ; CHECK-DAG: sub.
114
+ ; CHECK: st.{{[a-z]}}16
115
+ ; CHECK: ret;
116
+ %val = call i16 @llvm.ctlz.i16 (i16 %a , i1 false ) readnone
117
+ store i16 %val , i16* %b
118
+ ret void
119
+ }
120
+ ; CHECK-LABEL: myctpop_store16_2(
121
+ define void @myctpop_store16_2 (i16 %a , i16* %b ) {
122
+ ; CHECK: ld.param.
123
+ ; CHECK-NEXT: cvt.u32.u16
124
+ ; CHECK-NEXT: clz.b32
125
+ ; CHECK-DAG: cvt.u16.u32
126
+ ; CHECK-DAG: sub.
127
+ ; CHECK: st.{{[a-z]}}16
128
+ ; CHECK: ret;
129
+ %val = call i16 @llvm.ctlz.i16 (i16 %a , i1 true ) readnone ; i1 true: counterpart of myctpop_store16, same PTX expected
130
+ store i16 %val , i16* %b
131
+ ret void
44
132
}
0 commit comments