Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -543,6 +543,27 @@ setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + // NEON does not have single instruction CTTZ for vectors. + setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); @@ -4282,8 +4303,82 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); SDLoc dl(N); + EVT VT = N->getValueType(0); + if (VT.isVector()) { + assert(ST->hasNEON()); + + // Compute the least significant set bit: LSB = X & -X + SDValue X = N->getOperand(0); + SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); + + EVT ElemTy = VT.getVectorElementType(); + + if (ElemTy == MVT::i8) { + // Compute with: cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); + } + + if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && + (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { + // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 + unsigned NumBits = ElemTy.getSizeInBits(); + SDValue WidthMinus1 = + DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); + SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); + } + + // Compute with: cttz(x) = ctpop(lsb - 1) + + // Since we can only compute the number of bits in a byte with vcnt.8, we + // have to gather the result with pairwise addition (vpaddl) for i16, i32, + // and i64. + + // Compute LSB - 1. + SDValue Bits; + if (ElemTy == MVT::i64) { + // Load constant 0xffff'ffff'ffff'ffff to register. + SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0x1eff, dl, MVT::i32)); + Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); + } else { + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + } + + // Count #bits with vcnt.8. + EVT VT8Bit = VT.is64BitVector() ? 
MVT::v8i8 : MVT::v16i8; + SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); + SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); + + // Gather the #bits with vpaddl (pairwise add.) + EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt8); + if (ElemTy == MVT::i16) + return Cnt16; + + EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32; + SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt16); + if (ElemTy == MVT::i32) + return Cnt32; + + assert(ElemTy == MVT::i64); + SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt32); + return Cnt64; + } if (!ST->hasV6T2Ops()) return SDValue(); @@ -6487,7 +6582,8 @@ case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); Index: test/CodeGen/ARM/cttz.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/cttz.ll @@ -0,0 +1,90 @@ +; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 | FileCheck %s +; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 -mattr=+neon | FileCheck %s + +; This test checks the @llvm.cttz.* intrinsics for integers. 
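+; With +v6t2 the scalar lowering checked below is rbit followed by clz (with a
+; cmp/clzne sequence for i64). For the i8 and i16 zero-defined cases a bit is
+; first OR'd in at position 8/16 so that a zero input yields the type width.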
+ +declare i8 @llvm.cttz.i8(i8, i1) +declare i16 @llvm.cttz.i16(i16, i1) +declare i32 @llvm.cttz.i32(i32, i1) +declare i64 @llvm.cttz.i64(i64, i1) + +;------------------------------------------------------------------------------ + +define i8 @test_i8(i8 %a) { +; CHECK-LABEL: test_i8: +; CHECK: orr [[REG:r[0-9]+]], [[REG]], #256 +; CHECK: rbit +; CHECK: clz + %tmp = call i8 @llvm.cttz.i8(i8 %a, i1 false) + ret i8 %tmp +} + +define i16 @test_i16(i16 %a) { +; CHECK-LABEL: test_i16: +; CHECK: orr [[REG:r[0-9]+]], [[REG]], #65536 +; CHECK: rbit +; CHECK: clz + %tmp = call i16 @llvm.cttz.i16(i16 %a, i1 false) + ret i16 %tmp +} + +define i32 @test_i32(i32 %a) { +; CHECK-LABEL: test_i32: +; CHECK: rbit +; CHECK: clz + %tmp = call i32 @llvm.cttz.i32(i32 %a, i1 false) + ret i32 %tmp +} + +define i64 @test_i64(i64 %a) { +; CHECK-LABEL: test_i64: +; CHECK: rbit +; CHECK: rbit +; CHECK: cmp +; CHECK: clz +; CHECK: add +; CHECK: clzne + %tmp = call i64 @llvm.cttz.i64(i64 %a, i1 false) + ret i64 %tmp +} + +;------------------------------------------------------------------------------ + +define i8 @test_i8_zero_undef(i8 %a) { +; CHECK-LABEL: test_i8_zero_undef: +; CHECK-NOT: orr +; CHECK: rbit +; CHECK: clz + %tmp = call i8 @llvm.cttz.i8(i8 %a, i1 true) + ret i8 %tmp +} + +define i16 @test_i16_zero_undef(i16 %a) { +; CHECK-LABEL: test_i16_zero_undef: +; CHECK-NOT: orr +; CHECK: rbit +; CHECK: clz + %tmp = call i16 @llvm.cttz.i16(i16 %a, i1 true) + ret i16 %tmp +} + + +define i32 @test_i32_zero_undef(i32 %a) { +; CHECK-LABEL: test_i32_zero_undef: +; CHECK: rbit +; CHECK: clz + %tmp = call i32 @llvm.cttz.i32(i32 %a, i1 true) + ret i32 %tmp +} + +define i64 @test_i64_zero_undef(i64 %a) { +; CHECK-LABEL: test_i64_zero_undef: +; CHECK: rbit +; CHECK: rbit +; CHECK: cmp +; CHECK: clz +; CHECK: add +; CHECK: clzne + %tmp = call i64 @llvm.cttz.i64(i64 %a, i1 true) + ret i64 %tmp +} Index: test/CodeGen/ARM/cttz_vector.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/cttz_vector.ll @@ -0,0 +1,383 @@ +; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s + +; This test checks the @llvm.cttz.* intrinsics for vectors. 
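+; The vector lowering checked below isolates the lowest set bit with x & -x,
+; then either counts the bits of lsb-1 (vcnt.8, widened with vpaddl as needed;
+; the i64 case adds an all-ones splat instead of subtracting 1) or, for the
+; i16/i32 zero-undef cases, computes (width-1) - vclz(lsb).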
+ +declare <1 x i8> @llvm.cttz.v1i8(<1 x i8>, i1) +declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1) +declare <4 x i8> @llvm.cttz.v4i8(<4 x i8>, i1) +declare <8 x i8> @llvm.cttz.v8i8(<8 x i8>, i1) +declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1) + +declare <1 x i16> @llvm.cttz.v1i16(<1 x i16>, i1) +declare <2 x i16> @llvm.cttz.v2i16(<2 x i16>, i1) +declare <4 x i16> @llvm.cttz.v4i16(<4 x i16>, i1) +declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1) + +declare <1 x i32> @llvm.cttz.v1i32(<1 x i32>, i1) +declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) +declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) + +declare <1 x i64> @llvm.cttz.v1i64(<1 x i64>, i1) +declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) + +;------------------------------------------------------------------------------ + +define void @test_v1i8(<1 x i8>* %p) { +; CHECK-LABEL: test_v1i8 + %a = load <1 x i8>, <1 x i8>* %p + %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false) + store <1 x i8> %tmp, <1 x i8>* %p + ret void +} + +define void @test_v2i8(<2 x i8>* %p) { +; CHECK-LABEL: test_v2i8: + %a = load <2 x i8>, <2 x i8>* %p + %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false) + store <2 x i8> %tmp, <2 x i8>* %p + ret void +} + +define void @test_v4i8(<4 x i8>* %p) { +; CHECK-LABEL: test_v4i8: + %a = load <4 x i8>, <4 x i8>* %p + %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false) + store <4 x i8> %tmp, <4 x i8>* %p + ret void +} + +define void @test_v8i8(<8 x i8>* %p) { +; CHECK-LABEL: test_v8i8: +; CHECK: vldr [[D1:d[0-9]+]], [r0] +; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1 +; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]] +; CHECK: vand [[D1]], [[D1]], [[D3]] +; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]] +; CHECK: vcnt.8 [[D1]], [[D1]] +; CHECK: vstr [[D1]], [r0] + %a = load <8 x i8>, <8 x i8>* %p + %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false) + store <8 x i8> %tmp, <8 x i8>* %p + ret void +} + +define void @test_v16i8(<16 x i8>* %p) { +; CHECK-LABEL: test_v16i8: +; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] +; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1 +; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] +; CHECK: vand [[Q1]], [[Q1]], [[Q3]] +; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]] +; CHECK: vcnt.8 [[Q1]], [[Q1]] +; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] + %a = load <16 x i8>, <16 x i8>* %p + %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false) + store <16 x i8> %tmp, <16 x i8>* %p + ret void +} + +define void @test_v1i16(<1 x i16>* %p) { +; CHECK-LABEL: test_v1i16: + %a = load <1 x i16>, <1 x i16>* %p + %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false) + store <1 x i16> %tmp, <1 x i16>* %p + ret void +} + +define void @test_v2i16(<2 x i16>* %p) { +; CHECK-LABEL: test_v2i16: + %a = load <2 x i16>, <2 x i16>* %p + %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false) + store <2 x i16> %tmp, <2 x i16>* %p + ret void +} + +define void @test_v4i16(<4 x i16>* %p) { +; CHECK-LABEL: test_v4i16: +; CHECK: vldr [[D1:d[0-9]+]], [r0] +; CHECK: vmov.i16 [[D2:d[0-9]+]], #0x1 +; CHECK: vneg.s16 [[D3:d[0-9]+]], [[D1]] +; CHECK: vand [[D1]], [[D1]], [[D3]] +; CHECK: vsub.i16 [[D1]], [[D1]], [[D2]] +; CHECK: vcnt.8 [[D1]], [[D1]] +; CHECK: vpaddl.u8 [[D1]], [[D1]] +; CHECK: vstr [[D1]], [r0] + %a = load <4 x i16>, <4 x i16>* %p + %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false) + store <4 x i16> %tmp, <4 x i16>* %p + ret void +} + +define void @test_v8i16(<8 x i16>* %p) { +; CHECK-LABEL: test_v8i16: +; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] +; CHECK: vmov.i16 
[[Q2:q[0-9]+]], #0x1 +; CHECK: vneg.s16 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] +; CHECK: vand [[Q1]], [[Q1]], [[Q3]] +; CHECK: vsub.i16 [[Q1]], [[Q1]], [[Q2]] +; CHECK: vcnt.8 [[Q1]], [[Q1]] +; CHECK: vpaddl.u8 [[Q1]], [[Q1]] +; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] + %a = load <8 x i16>, <8 x i16>* %p + %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false) + store <8 x i16> %tmp, <8 x i16>* %p + ret void +} + +define void @test_v1i32(<1 x i32>* %p) { +; CHECK-LABEL: test_v1i32: + %a = load <1 x i32>, <1 x i32>* %p + %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false) + store <1 x i32> %tmp, <1 x i32>* %p + ret void +} + +define void @test_v2i32(<2 x i32>* %p) { +; CHECK-LABEL: test_v2i32: +; CHECK: vldr [[D1:d[0-9]+]], [r0] +; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x1 +; CHECK: vneg.s32 [[D3:d[0-9]+]], [[D1]] +; CHECK: vand [[D1]], [[D1]], [[D3]] +; CHECK: vsub.i32 [[D1]], [[D1]], [[D2]] +; CHECK: vcnt.8 [[D1]], [[D1]] +; CHECK: vpaddl.u8 [[D1]], [[D1]] +; CHECK: vpaddl.u16 [[D1]], [[D1]] +; CHECK: vstr [[D1]], [r0] + %a = load <2 x i32>, <2 x i32>* %p + %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) + store <2 x i32> %tmp, <2 x i32>* %p + ret void +} + +define void @test_v4i32(<4 x i32>* %p) { +; CHECK-LABEL: test_v4i32: +; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] +; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x1 +; CHECK: vneg.s32 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] +; CHECK: vand [[Q1]], [[Q1]], [[Q3]] +; CHECK: vsub.i32 [[Q1]], [[Q1]], [[Q2]] +; CHECK: vcnt.8 [[Q1]], [[Q1]] +; CHECK: vpaddl.u8 [[Q1]], [[Q1]] +; CHECK: vpaddl.u16 [[Q1]], [[Q1]] +; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] + %a = load <4 x i32>, <4 x i32>* %p + %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) + store <4 x i32> %tmp, <4 x i32>* %p + ret void +} + +define void @test_v1i64(<1 x i64>* %p) { +; CHECK-LABEL: test_v1i64: +; CHECK: vldr [[D1:d[0-9]+]], [r0] +; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0 +; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff +; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]] +; CHECK: vand [[D1]], [[D1]], [[D2]] +; CHECK: vadd.i64 [[D1]], [[D1]], [[D3]] +; CHECK: vcnt.8 [[D1]], [[D1]] +; CHECK: vpaddl.u8 [[D1]], [[D1]] +; CHECK: vpaddl.u16 [[D1]], [[D1]] +; CHECK: vpaddl.u32 [[D1]], [[D1]] +; CHECK: vstr [[D1]], [r0] + %a = load <1 x i64>, <1 x i64>* %p + %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false) + store <1 x i64> %tmp, <1 x i64>* %p + ret void +} + +define void @test_v2i64(<2 x i64>* %p) { +; CHECK-LABEL: test_v2i64: +; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] +; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0 +; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff +; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]] +; CHECK: vand [[Q1]], [[Q1]], [[Q2]] +; CHECK: vadd.i64 [[Q1]], [[Q1]], [[Q3]] +; CHECK: vcnt.8 [[Q1]], [[Q1]] +; CHECK: vpaddl.u8 [[Q1]], [[Q1]] +; CHECK: vpaddl.u16 [[Q1]], [[Q1]] +; CHECK: vpaddl.u32 [[Q1]], [[Q1]] +; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] + %a = load <2 x i64>, <2 x i64>* %p + %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false) + store <2 x i64> %tmp, <2 x i64>* %p + ret void +} + +;------------------------------------------------------------------------------ + +define void @test_v1i8_zero_undef(<1 x i8>* %p) { +; CHECK-LABEL: test_v1i8_zero_undef + %a = load <1 x i8>, <1 x i8>* %p + %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true) + store <1 x i8> %tmp, <1 x i8>* %p + ret void +} + +define void @test_v2i8_zero_undef(<2 x i8>* %p) { +; CHECK-LABEL: test_v2i8_zero_undef: + %a = load <2 x 
i8>, <2 x i8>* %p + %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true) + store <2 x i8> %tmp, <2 x i8>* %p + ret void +} + +define void @test_v4i8_zero_undef(<4 x i8>* %p) { +; CHECK-LABEL: test_v4i8_zero_undef: + %a = load <4 x i8>, <4 x i8>* %p + %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true) + store <4 x i8> %tmp, <4 x i8>* %p + ret void +} + +define void @test_v8i8_zero_undef(<8 x i8>* %p) { +; CHECK-LABEL: test_v8i8_zero_undef: +; CHECK: vldr [[D1:d[0-9]+]], [r0] +; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1 +; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]] +; CHECK: vand [[D1]], [[D1]], [[D3]] +; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]] +; CHECK: vcnt.8 [[D1]], [[D1]] +; CHECK: vstr [[D1]], [r0] + %a = load <8 x i8>, <8 x i8>* %p + %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true) + store <8 x i8> %tmp, <8 x i8>* %p + ret void +} + +define void @test_v16i8_zero_undef(<16 x i8>* %p) { +; CHECK-LABEL: test_v16i8_zero_undef: +; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] +; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1 +; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] +; CHECK: vand [[Q1]], [[Q1]], [[Q3]] +; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]] +; CHECK: vcnt.8 [[Q1]], [[Q1]] +; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] + %a = load <16 x i8>, <16 x i8>* %p + %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true) + store <16 x i8> %tmp, <16 x i8>* %p + ret void +} + +define void @test_v1i16_zero_undef(<1 x i16>* %p) { +; CHECK-LABEL: test_v1i16_zero_undef: + %a = load <1 x i16>, <1 x i16>* %p + %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true) + store <1 x i16> %tmp, <1 x i16>* %p + ret void +} + +define void @test_v2i16_zero_undef(<2 x i16>* %p) { +; CHECK-LABEL: test_v2i16_zero_undef: + %a = load <2 x i16>, <2 x i16>* %p + %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true) + store <2 x i16> %tmp, <2 x i16>* %p + ret void +} + +define void @test_v4i16_zero_undef(<4 x i16>* %p) { +; CHECK-LABEL: test_v4i16_zero_undef: +; CHECK: vldr [[D1:d[0-9]+]], [r0] +; CHECK: vneg.s16 [[D2:d[0-9]+]], [[D1]] +; CHECK: vand [[D1]], [[D1]], [[D2]] +; CHECK: vmov.i16 [[D3:d[0-9]+]], #0xf +; CHECK: vclz.i16 [[D1]], [[D1]] +; CHECK: vsub.i16 [[D1]], [[D3]], [[D1]] +; CHECK: vstr [[D1]], [r0] + %a = load <4 x i16>, <4 x i16>* %p + %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true) + store <4 x i16> %tmp, <4 x i16>* %p + ret void +} + +define void @test_v8i16_zero_undef(<8 x i16>* %p) { +; CHECK-LABEL: test_v8i16_zero_undef: +; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] +; CHECK: vneg.s16 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]] +; CHECK: vand [[Q1]], [[Q1]], [[Q2]] +; CHECK: vmov.i16 [[Q3:q[0-9]+]], #0xf +; CHECK: vclz.i16 [[Q1]], [[Q1]] +; CHECK: vsub.i16 [[Q1]], [[Q3]], [[Q1]] +; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] + %a = load <8 x i16>, <8 x i16>* %p + %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) + store <8 x i16> %tmp, <8 x i16>* %p + ret void +} + +define void @test_v1i32_zero_undef(<1 x i32>* %p) { +; CHECK-LABEL: test_v1i32_zero_undef: + %a = load <1 x i32>, <1 x i32>* %p + %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true) + store <1 x i32> %tmp, <1 x i32>* %p + ret void +} + +define void @test_v2i32_zero_undef(<2 x i32>* %p) { +; CHECK-LABEL: test_v2i32_zero_undef: +; CHECK: vldr [[D1:d[0-9]+]], [r0] +; CHECK: vneg.s32 [[D2:d[0-9]+]], [[D1]] +; CHECK: vand [[D1]], [[D1]], [[D2]] +; CHECK: vmov.i32 [[D3:d[0-9]+]], #0x1f +; CHECK: vclz.i32 [[D1]], [[D1]] +; CHECK: vsub.i32 [[D1]], [[D3]], [[D1]] +; CHECK: vstr [[D1]], [r0] 
+ %a = load <2 x i32>, <2 x i32>* %p + %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true) + store <2 x i32> %tmp, <2 x i32>* %p + ret void +} + +define void @test_v4i32_zero_undef(<4 x i32>* %p) { +; CHECK-LABEL: test_v4i32_zero_undef: +; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] +; CHECK: vneg.s32 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]] +; CHECK: vand [[Q1]], [[Q1]], [[Q2]] +; CHECK: vmov.i32 [[Q3:q[0-9]+]], #0x1f +; CHECK: vclz.i32 [[Q1]], [[Q1]] +; CHECK: vsub.i32 [[Q1]], [[Q3]], [[Q1]] +; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] + %a = load <4 x i32>, <4 x i32>* %p + %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true) + store <4 x i32> %tmp, <4 x i32>* %p + ret void +} + +define void @test_v1i64_zero_undef(<1 x i64>* %p) { +; CHECK-LABEL: test_v1i64_zero_undef: +; CHECK: vldr [[D1:d[0-9]+]], [r0] +; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0 +; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff +; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]] +; CHECK: vand [[D1]], [[D1]], [[D2]] +; CHECK: vadd.i64 [[D1]], [[D1]], [[D3]] +; CHECK: vcnt.8 [[D1]], [[D1]] +; CHECK: vpaddl.u8 [[D1]], [[D1]] +; CHECK: vpaddl.u16 [[D1]], [[D1]] +; CHECK: vpaddl.u32 [[D1]], [[D1]] +; CHECK: vstr [[D1]], [r0] + %a = load <1 x i64>, <1 x i64>* %p + %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true) + store <1 x i64> %tmp, <1 x i64>* %p + ret void +} + +define void @test_v2i64_zero_undef(<2 x i64>* %p) { +; CHECK-LABEL: test_v2i64_zero_undef: +; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] +; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0 +; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff +; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]] +; CHECK: vand [[Q1]], [[Q1]], [[Q2]] +; CHECK: vadd.i64 [[Q1]], [[Q1]], [[Q3]] +; CHECK: vcnt.8 [[Q1]], [[Q1]] +; CHECK: vpaddl.u8 [[Q1]], [[Q1]] +; CHECK: vpaddl.u16 [[Q1]], [[Q1]] +; CHECK: vpaddl.u32 [[Q1]], [[Q1]] +; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] + %a = load <2 x i64>, <2 x i64>* %p + %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) + store <2 x i64> %tmp, <2 x i64>* %p + ret void +} Index: test/CodeGen/ARM/ctz.ll =================================================================== --- test/CodeGen/ARM/ctz.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s - -declare i32 @llvm.cttz.i32(i32, i1) - -define i32 @f1(i32 %a) { -; CHECK-LABEL: f1: -; CHECK: rbit -; CHECK: clz - %tmp = call i32 @llvm.cttz.i32( i32 %a, i1 true ) - ret i32 %tmp -}
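For reference (not part of the patch): the vector lowering above relies on two scalar bit-manipulation identities, cttz(x) == popcount((x & -x) - 1) for all x, and cttz(x) == (width - 1) - clz(x & -x) for x != 0. The standalone C++ sketch below checks both identities over a sample of 32-bit values; the helper names (popcount32, clz32, cttzRef) are illustrative and do not appear in the patch.

// Standalone identity check for the cttz lowering (illustrative, not from the
// patch). Verifies, for 32-bit values:
//   cttz(x) == popcount((x & -x) - 1)   (also holds for x == 0)
//   cttz(x) == 31 - clz(x & -x)         (only for x != 0)
#include <cassert>
#include <cstdint>

static unsigned popcount32(uint32_t V) {
  unsigned N = 0;
  for (; V; V &= V - 1)  // clear the lowest set bit each iteration
    ++N;
  return N;
}

static unsigned clz32(uint32_t V) {  // clz32(0) == 32
  unsigned N = 0;
  for (uint32_t Bit = UINT32_C(1) << 31; Bit && !(V & Bit); Bit >>= 1)
    ++N;
  return N;
}

static unsigned cttzRef(uint32_t V) {  // reference count; cttzRef(0) == 32
  unsigned N = 0;
  for (uint32_t Bit = 1; Bit && !(V & Bit); Bit <<= 1)
    ++N;
  return N;
}

int main() {
  for (uint32_t I = 0; I <= 0xFFFF; ++I) {
    uint32_t X = I * UINT32_C(0x9E3779B9);  // arbitrary sample, includes X == 0
    uint32_t LSB = X & (0u - X);            // isolate the least significant set bit
    // ctpop form: used for i8, for i16/i32 when zero is defined, and for i64.
    assert(popcount32(LSB - 1) == cttzRef(X));
    // clz form: used for the i16/i32 CTTZ_ZERO_UNDEF paths; not valid for X == 0.
    if (X != 0)
      assert(31 - clz32(LSB) == cttzRef(X));
  }
  return 0;
}

Note that the i64 path in the patch adds the all-ones splat (via the 0x1eff VMOV modified-immediate encoding) rather than subtracting a splat of 1, presumably because a per-element 64-bit constant 1 is not encodable as a NEON VMOV immediate.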