[NVPTX] Improve lowering of llvm.ctlz.

Justin Lebar · Justin Lebar · commit d17de5380ba2 · 2017-01-18T00:07:35.000Z
Summary: * Disable "ctlz speculation", which inserts a branch on every ctlz(x) which has defined behavior on x == 0 to check whether x is, in fact zero. * Add DAG patterns that avoid re-truncating or re-expanding the result of the 16- and 64-bit ctz instructions. Reviewers: tra Subscribers: llvm-commits, jholewinski Differential Revision: https://reviews.llvm.org/D28719 llvm-svn: 292299
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -517,6 +517,12 @@ class NVPTXTargetLowering : public TargetLowering {
 
   bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
 
+  // The default is to transform llvm.ctlz(x, false) (where false indicates that
+  // x == 0 is not undefined behavior) into a branch that checks whether x is 0
+  // and avoids calling ctlz in that case.  We have a dedicated ctlz
+  // instruction, so we say that ctlz is cheap to speculate.
+  bool isCheapToSpeculateCtlz() const override { return true; }
+
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2774,18 +2774,32 @@ let hasSideEffects = 0 in {
 // 32-bit has a direct PTX instruction
 def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
 
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// The return type of the ctlz ISD node is the same as its input, but the PTX
+// ctz instruction always returns a 32-bit value.  For ctlz.i64, convert the
+// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
+// truncating back down to 32 bits.
 def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
 
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
-// than 16 bits to store). We also need to subtract 16 because the
-// high-order 16 zeros were counted.
+// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
+// result back to 16-bits if necessary.  We also need to subtract 16 because
+// the high-order 16 zeros were counted.
+//
+// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
+// use to save one SASS instruction (on sm_35 anyway):
+//
+//   mov.b32 $tmp, {0xffff, $a}
+//   ctlz.b32 $result, $tmp
+//
+// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
+// and then ctlz that value.  This way we don't have to subtract 16 from the
+// result.  Unfortunately today we don't have a way to generate
+// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
 def : Pat<(ctlz Int16Regs:$a),
-          (SUBi16ri (CVT_u16_u32 (CLZr32
-            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
-           CvtNONE), 16)>;
+          (SUBi16ri (CVT_u16_u32
+           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
 
 // Population count
 let hasSideEffects = 0 in {
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -6,39 +6,127 @@ declare i16 @llvm.ctlz.i16(i16, i1) readnone
 declare i32 @llvm.ctlz.i32(i32, i1) readnone
 declare i64 @llvm.ctlz.i64(i64, i1) readnone
 
+; There should be no difference between llvm.ctlz.i32(%a, true) and
+; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
+
+; CHECK-LABEL: myctpop(
 define i32 @myctpop(i32 %a) {
-; CHECK: clz.b32
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
   ret i32 %val
 }
-
-define i16 @myctpop16(i16 %a) {
-; CHECK: clz.b32
-  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
-  ret i16 %val
+; CHECK-LABEL: myctpop_2(
+define i32 @myctpop_2(i32 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
+  ret i32 %val
 }
 
+; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
+; value, so here we have to zero-extend it.
+; CHECK-LABEL: myctpop64(
 define i64 @myctpop64(i64 %a) {
-; CHECK: clz.b64
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
   ret i64 %val
 }
+; CHECK-LABEL: myctpop64_2(
+define i64 @myctpop64_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+  ret i64 %val
+}
 
-
-define i32 @myctpop_2(i32 %a) {
-; CHECK: clz.b32
-  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
-  ret i32 %val
+; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
+; natural return width of ptx's clz.b64 instruction.  No conversions should be
+; necessary in the PTX.
+; CHECK-LABEL: myctpop64_as_32(
+define i32 @myctpop64_as_32(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
+}
+; CHECK-LABEL: myctpop64_as_32_2(
+define i32 @myctpop64_as_32_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
 }
 
-define i16 @myctpop16_2(i16 %a) {
-; CHECK: clz.b32
+; ctlz.i16 is implemented by extending the input to i32, computing the result,
+; and then truncating the result back down to i16.  But the NVPTX ABI
+; zero-extends i16 return values to i32, so the final truncation doesn't appear
+; in this function.
+; CHECK-LABEL: myctpop_ret16(
+define i16 @myctpop_ret16(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  ret i16 %val
+}
+; CHECK-LABEL: myctpop_ret16_2(
+define i16 @myctpop_ret16_2(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
   ret i16 %val
 }
 
-define i64 @myctpop64_2(i64 %a) {
-; CHECK: clz.b64
-  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
-  ret i64 %val
+; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
+; remain.
+; CHECK-LABEL: myctpop_store16(
+define void @myctpop_store16(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NET: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  store i16 %val, i16* %b
+  ret void
+}
+; CHECK-LABEL: myctpop_store16_2(
+define void @myctpop_store16_2(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NET: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  store i16 %val, i16* %b
+  ret void
 }