Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.h
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h
@@ -431,6 +431,10 @@
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+    /// with this index.
+    bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
     /// \brief Returns true if an argument of type Ty needs to be passed in a
     /// contiguous block of registers in calling convention CallConv.
     bool functionArgumentNeedsConsecutiveRegisters(
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -12921,6 +12921,14 @@
   return true;
 }
 
+bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                         ARM_MB::MemBOpt Domain) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Index: llvm/trunk/test/CodeGen/ARM/vext.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vext.ll
+++ llvm/trunk/test/CodeGen/ARM/vext.ll
@@ -217,21 +217,19 @@
 ; CHECK-LABEL: test_multisource:
 ; CHECK:       @ BB#0:
 ; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    add r2, r0, #48
-; CHECK-NEXT:    add r0, r0, #32
+; CHECK-NEXT:    add r2, r0, #32
+; CHECK-NEXT:    add r0, r0, #48
 ; CHECK-NEXT:    vld1.16 {d16, d17}, [r1:128]!
 ; CHECK-NEXT:    vld1.64 {d20, d21}, [r2:128]
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
-; CHECK-NEXT:    vmov.u16 r1, d16[0]
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r0:128]
-; CHECK-NEXT:    vmov.16 d22[0], r1
-; CHECK-NEXT:    vmov.u16 r0, d18[0]
-; CHECK-NEXT:    vmov.u16 r1, d16[0]
-; CHECK-NEXT:    vmov.16 d22[1], r0
-; CHECK-NEXT:    vmov.u16 r0, d20[0]
-; CHECK-NEXT:    vmov.16 d22[2], r1
-; CHECK-NEXT:    vmov.16 d22[3], r0
-; CHECK-NEXT:    vmov r0, r1, d22
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT:    vld1.64 {d22, d23}, [r1:128]
+; CHECK-NEXT:    vorr d24, d20, d20
+; CHECK-NEXT:    vzip.16 d24, d18
+; CHECK-NEXT:    vext.16 d18, d20, d24, #2
+; CHECK-NEXT:    vtrn.16 q8, q11
+; CHECK-NEXT:    vext.16 d16, d18, d16, #2
+; CHECK-NEXT:    vext.16 d16, d16, d16, #2
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <32 x i16>, <32 x i16>* %B
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> 
@@ -244,14 +242,8 @@
 ; CHECK-LABEL: test_largespan:
 ; CHECK:       @ BB#0:
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT:    vmov.u16 r1, d16[0]
-; CHECK-NEXT:    vmov.u16 r0, d16[2]
-; CHECK-NEXT:    vmov.16 d18[0], r1
-; CHECK-NEXT:    vmov.u16 r1, d17[0]
-; CHECK-NEXT:    vmov.16 d18[1], r0
-; CHECK-NEXT:    vmov.u16 r0, d17[2]
-; CHECK-NEXT:    vmov.16 d18[2], r1
-; CHECK-NEXT:    vmov.16 d18[3], r0
+; CHECK-NEXT:    vorr d18, d16, d16
+; CHECK-NEXT:    vuzp.16 d18, d17
 ; CHECK-NEXT:    vmov r0, r1, d18
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i16>, <8 x i16>* %B
Index: llvm/trunk/test/CodeGen/ARM/vpadd.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vpadd.ll
+++ llvm/trunk/test/CodeGen/ARM/vpadd.ll
@@ -213,36 +213,47 @@
   ret <2 x i64> %tmp2
 }
 
-; Test AddCombine optimization that generates a vpaddl.s
-define void @addCombineToVPADDL() nounwind ssp {
-; CHECK-LABEL: addCombineToVPADDL:
-; CHECK:       @ BB#0:
-; CHECK-NEXT:    .save {r11}
-; CHECK-NEXT:    push {r11}
-; CHECK-NEXT:    .setfp r11, sp
-; CHECK-NEXT:    mov r11, sp
-; CHECK-NEXT:    .pad #44
-; CHECK-NEXT:    sub sp, sp, #44
-; CHECK-NEXT:    bic sp, sp, #15
-; CHECK-NEXT:    add r0, sp, #16
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r0:128]
-; CHECK-NEXT:    vpaddl.s8 q8, q8
-; CHECK-NEXT:    vmovn.i16 d16, q8
-; CHECK-NEXT:    vstr d16, [sp, #8]
-; CHECK-NEXT:    mov sp, r11
-; CHECK-NEXT:    pop {r11}
+; Combine vuzp+vadd->vpadd.
+; FIXME: Implement this optimization
+define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vorr d18, d17, d17
+; CHECK-NEXT:    vuzp.8 d16, d18
+; CHECK-NEXT:    vadd.i8 d16, d18, d16
+; CHECK-NEXT:    vstr d16, [r1]
 ; CHECK-NEXT:    mov pc, lr
-  %cbcr = alloca <16 x i8>, align 16
-  %X = alloca <8 x i8>, align 8
   %tmp = load <16 x i8>, <16 x i8>* %cbcr
   %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> 
-  %tmp2 = load <16 x i8>, <16 x i8>* %cbcr
-  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> 
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> 
+  %add = add <8 x i8> %tmp3, %tmp1
   store <8 x i8> %add, <8 x i8>* %X, align 8
   ret void
 }
 
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Implement this optimization.
+define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_sext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vorr d18, d17, d17
+; CHECK-NEXT:    vuzp.8 d16, d18
+; CHECK-NEXT:    vaddl.s8 q8, d18, d16
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> 
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> 
+  %tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
+  %add = add <8 x i16> %tmp4, %tmp5
+  store <8 x i16> %add, <8 x i16>* %X, align 8
+  ret void
+}
+
 ; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
 ; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
 define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
Index: llvm/trunk/test/CodeGen/ARM/vuzp.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vuzp.ll
+++ llvm/trunk/test/CodeGen/ARM/vuzp.ll
@@ -7,14 +7,14 @@
 ; CHECK-NEXT:    vldr d16, [r1]
 ; CHECK-NEXT:    vldr d17, [r0]
 ; CHECK-NEXT:    vuzp.8 d17, d16
-; CHECK-NEXT:    vadd.i8 d16, d17, d16
+; CHECK-NEXT:    vmul.i8 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> 
   %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> 
-  %tmp5 = add <8 x i8> %tmp3, %tmp4
+  %tmp5 = mul <8 x i8> %tmp3, %tmp4
   ret <8 x i8> %tmp5
 }
@@ -39,14 +39,14 @@
 ; CHECK-NEXT:    vldr d16, [r1]
 ; CHECK-NEXT:    vldr d17, [r0]
 ; CHECK-NEXT:    vuzp.16 d17, d16
-; CHECK-NEXT:    vadd.i16 d16, d17, d16
+; CHECK-NEXT:    vmul.i16 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> 
   %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> 
-  %tmp5 = add <4 x i16> %tmp3, %tmp4
+  %tmp5 = mul <4 x i16> %tmp3, %tmp4
   ret <4 x i16> %tmp5
 }
@@ -207,14 +207,14 @@
 ; CHECK-NEXT:    vldr d16, [r1]
 ; CHECK-NEXT:    vldr d17, [r0]
 ; CHECK-NEXT:    vuzp.8 d17, d16
-; CHECK-NEXT:    vadd.i8 d16, d17, d16
+; CHECK-NEXT:    vmul.i8 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> 
   %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> 
-  %tmp5 = add <8 x i8> %tmp3, %tmp4
+  %tmp5 = mul <8 x i8> %tmp3, %tmp4
   ret <8 x i8> %tmp5
 }
@@ -550,3 +550,22 @@
   %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
   ret <10 x i8> %rv
 }
+
+%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
+define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
+; CHECK-LABEL: vuzp_extract_subvector:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vorr d18, d17, d17
+; CHECK-NEXT:    vuzp.8 d16, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d18
+; CHECK-NEXT:    mov pc, lr
+
+  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> 
+  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> 
+  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+  ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
Index: llvm/trunk/test/CodeGen/ARM/vzip.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vzip.ll
+++ llvm/trunk/test/CodeGen/ARM/vzip.ll
@@ -332,9 +332,8 @@
 ; CHECK-LABEL: vzip_vext_factor:
 ; CHECK:       @ BB#0: @ %entry
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT:    vext.16 d16, d16, d17, #3
-; CHECK-NEXT:    vext.16 d17, d16, d16, #1
-; CHECK-NEXT:    vzip.16 d16, d17
+; CHECK-NEXT:    vext.16 d18, d16, d17, #1
+; CHECK-NEXT:    vext.16 d16, d18, d17, #2
 ; CHECK-NEXT:    vext.16 d16, d16, d16, #1
 ; CHECK-NEXT:    vstr d16, [r1]
 ; CHECK-NEXT:    mov pc, lr