Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -13633,6 +13633,33 @@
     Known.One &= Mask;
     return;
   }
+  case ARMISD::VGETLANEs:
+  case ARMISD::VGETLANEu: {
+    const SDValue &SrcSV = Op.getOperand(0);
+    EVT VecVT = SrcSV.getValueType();
+    assert(VecVT.isVector() && "VGETLANE expected a vector type");
+    const unsigned NumSrcElts = VecVT.getVectorNumElements();
+    ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
+    assert(Pos->getAPIntValue().ult(NumSrcElts) &&
+           "VGETLANE index out of bounds");
+    unsigned Idx = Pos->getZExtValue();
+    APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
+    Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
+
+    EVT VT = Op.getValueType();
+    const unsigned DstSz = VT.getScalarSizeInBits();
+    const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
+    assert(SrcSz == Known.getBitWidth());
+    assert(DstSz > SrcSz);
+    if (Op.getOpcode() == ARMISD::VGETLANEs)
+      Known = Known.sext(DstSz);
+    else {
+      Known = Known.zext(DstSz);
+      Known.Zero.setBitsFrom(SrcSz);
+    }
+    assert(DstSz == Known.getBitWidth());
+    break;
+  }
   }
 }

Index: llvm/trunk/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
+++ llvm/trunk/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
@@ -1,50 +1,52 @@
 ; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
 
-; CHECK-LABEL: f:
 define float @f(<4 x i16>* nocapture %in) {
-  ; CHECK: vld1
-  ; CHECK: vmovl.u16
-  ; CHECK-NOT: vand
+; CHECK-LABEL: f:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.16 {d16}, [r0:64]
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vcvt.f32.u32 q0, q8
+; CHECK-NEXT:    vadd.f32 s4, s0, s1
+; CHECK-NEXT:    vadd.f32 s0, s4, s2
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
   %1 = load <4 x i16>, <4 x i16>* %in
 
-  ; CHECK: vcvt.f32.u32
   %2 = uitofp <4 x i16> %1 to <4 x float>
   %3 = extractelement <4 x float> %2, i32 0
   %4 = extractelement <4 x float> %2, i32 1
   %5 = extractelement <4 x float> %2, i32 2
-  ; CHECK: vadd.f32
   %6 = fadd float %3, %4
   %7 = fadd float %6, %5
 
   ret float %7
 }
 
-; CHECK-LABEL: g:
 define float @g(<4 x i16>* nocapture %in) {
-  ; CHECK: vldr
+; CHECK-LABEL: g:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvt.f32.u32 s0, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
   %1 = load <4 x i16>, <4 x i16>* %in
-
-  ; For now we're generating a vmov.16 and a uxth instruction.
-  ; The uxth is redundant, and we should be able to extend without
-  ; having to generate cross-domain copies. Once we can do this
-  ; we should modify the checks below.
-
-  ; CHECK: uxth
   %2 = extractelement <4 x i16> %1, i32 0
-  ; CHECK: vcvt.f32.u32
   %3 = uitofp i16 %2 to float
   ret float %3
 }
 
 ; Make sure we generate zext from <4 x i8> to <4 x 32>.
-
-; CHECK-LABEL: h:
-; CHECK: vld1.32
-; CHECK: vmovl.u8 q8, d16
-; CHECK: vmovl.u16 q8, d16
-; CHECK: vmov r0, r1, d16
-; CHECK: vmov r2, r3, d17
 define <4 x i32> @h(<4 x i8> *%in) {
+; CHECK-LABEL: h:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
   %1 = load <4 x i8>, <4 x i8>* %in, align 4
   %2 = extractelement <4 x i8> %1, i32 0
   %3 = zext i8 %2 to i32
@@ -60,3 +62,79 @@
   %13 = insertelement <4 x i32> %10, i32 %12, i32 3
   ret <4 x i32> %13
 }
+
+define float @i(<4 x i16>* nocapture %in) {
+  ; FIXME: The vmov.u + sxt can convert to a vmov.s
+; CHECK-LABEL: i:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+  %1 = load <4 x i16>, <4 x i16>* %in
+  %2 = extractelement <4 x i16> %1, i32 0
+  %3 = sitofp i16 %2 to float
+  ret float %3
+}
+
+define float @j(<8 x i8>* nocapture %in) {
+; CHECK-LABEL: j:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.u8 r0, d16[7]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvt.f32.u32 s0, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+  %1 = load <8 x i8>, <8 x i8>* %in
+  %2 = extractelement <8 x i8> %1, i32 7
+  %3 = uitofp i8 %2 to float
+  ret float %3
+}
+
+define float @k(<8 x i8>* nocapture %in) {
+; FIXME: The vmov.u + sxt can convert to a vmov.s
+; CHECK-LABEL: k:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.u8 r0, d16[7]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+  %1 = load <8 x i8>, <8 x i8>* %in
+  %2 = extractelement <8 x i8> %1, i32 7
+  %3 = sitofp i8 %2 to float
+  ret float %3
+}
+
+define float @KnownUpperZero(<4 x i16> %v) {
+; FIXME: uxtb are not required
+; CHECK-LABEL: KnownUpperZero:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 d16, #0x3
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vand d16, d17, d16
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    vmov.u16 r1, d16[3]
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    uxtb r0, r1
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vcvt.f32.s32 s2, s2
+; CHECK-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+  %1 = and <4 x i16> %v, <i16 3, i16 3, i16 3, i16 3>
+  %2 = extractelement <4 x i16> %1, i32 3
+  %3 = extractelement <4 x i16> %1, i32 0
+  %sinf1 = sitofp i16 %2 to float
+  %sinf2 = sitofp i16 %3 to float
+  %sum = fadd float %sinf1, %sinf2
+  ret float %sum
+}