Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.h
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -517,6 +517,12 @@
   bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
 
+  // The default is to transform llvm.ctlz(x, false) (where false indicates that
+  // x == 0 is not undefined behavior) into a branch that checks whether x is 0
+  // and avoids calling ctlz in that case. We have a dedicated ctlz
+  // instruction, so we say that ctlz is cheap to speculate.
+  bool isCheapToSpeculateCtlz() const override { return true; }
+
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2774,18 +2774,32 @@
 // 32-bit has a direct PTX instruction
 def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
 
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// The return type of the ctlz ISD node is the same as its input, but the PTX
+// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
+// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
+// truncating back down to 32 bits.
 def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
 
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
-// than 16 bits to store). We also need to subtract 16 because the
-// high-order 16 zeros were counted.
+// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
+// result back to 16-bits if necessary. We also need to subtract 16 because
+// the high-order 16 zeros were counted.
+//
+// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
+// use to save one SASS instruction (on sm_35 anyway):
+//
+//   mov.b32 $tmp, {0xffff, $a}
+//   ctlz.b32 $result, $tmp
+//
+// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
+// and then ctlz that value. This way we don't have to subtract 16 from the
+// result. Unfortunately today we don't have a way to generate
+// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
 def : Pat<(ctlz Int16Regs:$a),
-          (SUBi16ri (CVT_u16_u32 (CLZr32
-            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
-            CvtNONE), 16)>;
+          (SUBi16ri (CVT_u16_u32
+            (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
 
 // Population count
 let hasSideEffects = 0 in {
Index: llvm/test/CodeGen/NVPTX/ctlz.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/ctlz.ll
+++ llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -6,39 +6,127 @@
 declare i32 @llvm.ctlz.i32(i32, i1) readnone
 declare i64 @llvm.ctlz.i64(i64, i1) readnone
 
+; There should be no difference between llvm.ctlz.i32(%a, true) and
+; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
+
+; CHECK-LABEL: myctpop(
 define i32 @myctpop(i32 %a) {
-; CHECK: clz.b32
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
   ret i32 %val
 }
-
-define i16 @myctpop16(i16 %a) {
-; CHECK: clz.b32
-  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
-  ret i16 %val
-}
-
-define i64 @myctpop64(i64 %a) {
-; CHECK: clz.b64
-  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
-  ret i64 %val
-}
-
-
+; CHECK-LABEL: myctpop_2(
 define i32 @myctpop_2(i32 %a) {
-; CHECK: clz.b32
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
   ret i32 %val
 }
 
-define i16 @myctpop16_2(i16 %a) {
-; CHECK: clz.b32
+; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
+; value, so here we have to zero-extend it.
+; CHECK-LABEL: myctpop64(
+define i64 @myctpop64(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  ret i64 %val
+}
+; CHECK-LABEL: myctpop64_2(
+define i64 @myctpop64_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+  ret i64 %val
+}
+
+; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
+; natural return width of ptx's clz.b64 instruction. No conversions should be
+; necessary in the PTX.
+; CHECK-LABEL: myctpop64_as_32(
+define i32 @myctpop64_as_32(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
+}
+; CHECK-LABEL: myctpop64_as_32_2(
+define i32 @myctpop64_as_32_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
+}
+
+; ctlz.i16 is implemented by extending the input to i32, computing the result,
+; and then truncating the result back down to i16. But the NVPTX ABI
+; zero-extends i16 return values to i32, so the final truncation doesn't appear
+; in this function.
+; CHECK-LABEL: myctpop_ret16(
+define i16 @myctpop_ret16(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  ret i16 %val
+}
+; CHECK-LABEL: myctpop_ret16_2(
+define i16 @myctpop_ret16_2(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
   ret i16 %val
 }
 
-define i64 @myctpop64_2(i64 %a) {
-; CHECK: clz.b64
-  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
-  ret i64 %val
+; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
+; remain.
+; CHECK-LABEL: myctpop_store16(
+define void @myctpop_store16(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  store i16 %val, i16* %b
+  ret void
+}
+; CHECK-LABEL: myctpop_store16_2(
+define void @myctpop_store16_2(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
+  store i16 %val, i16* %b
+  ret void
 }