Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -431,6 +431,10 @@
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+    /// with this index.
+    bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
     /// \brief Returns true if an argument of type Ty needs to be passed in a
     /// contiguous block of registers in calling convention CallConv.
     bool functionArgumentNeedsConsecutiveRegisters(
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -12899,6 +12899,14 @@
   return true;
 }
 
+bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                         ARM_MB::MemBOpt Domain) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Index: test/CodeGen/ARM/vext.ll
===================================================================
--- test/CodeGen/ARM/vext.ll
+++ test/CodeGen/ARM/vext.ll
@@ -134,28 +134,34 @@
   ret <4 x i16> %tmp3
 }
 
-; We should ignore a build_vector with more than two sources.
-; Use illegal <32 x i16> type to produce such a shuffle after legalizing types.
-; Try to look for fallback to by-element inserts.
+; FIXME: Lower this more efficiently. (Given an arbitrary <32 x i16>, I think
+; the most efficient lowering is three vext shuffles.)
 define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
 ;CHECK-LABEL: test_multisource:
-;CHECK: vmov.16 [[REG:d[0-9]+]][0]
-;CHECK: vmov.16 [[REG]][1]
-;CHECK: vmov.16 [[REG]][2]
-;CHECK: vmov.16 [[REG]][3]
+;CHECK: vld1.16
+;CHECK-NEXT: vld1.64
+;CHECK-NEXT: vld1.64
+;CHECK-NEXT: vld1.64
+;CHECK-NEXT: vorr
+;CHECK-NEXT: vzip.16
+;CHECK-NEXT: vext.16
+;CHECK-NEXT: vtrn.16
+;CHECK-NEXT: vext.16
+;CHECK-NEXT: vext.16
+;CHECK-NEXT: vmov r0, r1
   %tmp1 = load <32 x i16>, <32 x i16>* %B
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32>
   ret <4 x i16> %tmp2
 }
 
-; We don't handle shuffles using more than half of a 128-bit vector.
-; Again, test for fallback to by-element inserts.
+; If we split the operand into two <4 x i16> vectors, this becomes
+; a vuzp.
 define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
 ;CHECK-LABEL: test_largespan:
-;CHECK: vmov.16 [[REG:d[0-9]+]][0]
-;CHECK: vmov.16 [[REG]][1]
-;CHECK: vmov.16 [[REG]][2]
-;CHECK: vmov.16 [[REG]][3]
+;CHECK: vld1.64
+;CHECK-NEXT: vorr
+;CHECK-NEXT: vuzp.16
+;CHECK-NEXT: vmov r0, r1
   %tmp1 = load <8 x i16>, <8 x i16>* %B
   %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32>
   ret <4 x i16> %tmp2
Index: test/CodeGen/ARM/vpadd.ll
===================================================================
--- test/CodeGen/ARM/vpadd.ll
+++ test/CodeGen/ARM/vpadd.ll
@@ -138,20 +138,36 @@
   ret <2 x i64> %tmp2
 }
 
-; Test AddCombine optimization that generates a vpaddl.s
-define void @addCombineToVPADDL() nounwind ssp {
-; CHECK: vpaddl.s8
-  %cbcr = alloca <16 x i8>, align 16
-  %X = alloca <8 x i8>, align 8
+; Combine vuzp+vadd->vpadd.
+; FIXME: Implement this optimization
+define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD:
+; CHECK: vuzp.8
+; CHECK: vadd.i8
   %tmp = load <16 x i8>, <16 x i8>* %cbcr
   %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
-  %tmp2 = load <16 x i8>, <16 x i8>* %cbcr
-  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32>
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
   %add = add <8 x i8> %tmp3, %tmp1
   store <8 x i8> %add, <8 x i8>* %X, align 8
   ret void
 }
 
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Implement this optimization.
+define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_sext:
+; CHECK: vuzp.8
+; CHECK: vaddl.s8
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
+  %tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
+  %add = add <8 x i16> %tmp4, %tmp5
+  store <8 x i16> %add, <8 x i16>* %X, align 8
+  ret void
+}
+
 ; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
 ; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
 define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -6,14 +6,14 @@
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.8 d17, d16
-; CHECK-NEXT: vadd.i8 d16, d17, d16
+; CHECK-NEXT: vmul.i8 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
   %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
-  %tmp5 = add <8 x i8> %tmp3, %tmp4
+  %tmp5 = mul <8 x i8> %tmp3, %tmp4
   ret <8 x i8> %tmp5
 }
 
@@ -38,14 +38,14 @@
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.16 d17, d16
-; CHECK-NEXT: vadd.i16 d16, d17, d16
+; CHECK-NEXT: vmul.i16 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32>
   %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32>
-  %tmp5 = add <4 x i16> %tmp3, %tmp4
+  %tmp5 = mul <4 x i16> %tmp3, %tmp4
   ret <4 x i16> %tmp5
 }
 
@@ -206,14 +206,14 @@
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.8 d17, d16
-; CHECK-NEXT: vadd.i8 d16, d17, d16
+; CHECK-NEXT: vmul.i8 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
   %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
-  %tmp5 = add <8 x i8> %tmp3, %tmp4
+  %tmp5 = mul <8 x i8> %tmp3, %tmp4
   ret <8 x i8> %tmp5
 }
 
@@ -370,3 +370,20 @@
   %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
   ret <10 x i8> %rv
 }
+
+%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
+define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
+; CHECK-LABEL: vuzp_extract_subvector
+; CHECK: vmov
+; CHECK-NEXT: vmov
+; CHECK-NEXT: vorr
+; CHECK-NEXT: vuzp.8
+; CHECK-NEXT: vmov
+; CHECK-NEXT: vmov
+
+  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32>
+  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32>
+  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+  ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -309,8 +309,11 @@
 define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) {
 entry:
   ; CHECK-LABEL: vzip_vext_factor
-  ; CHECK: vext.16 d16, d16, d17, #3
-  ; CHECK: vzip
+  ; CHECK: vld1.64
+  ; CHECK-NEXT: vext.16
+  ; CHECK-NEXT: vext.16
+  ; CHECK-NEXT: vext.16
+  ; CHECK-NEXT: vstr
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %0 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32>
   store <4 x i16> %0, <4 x i16>* %B
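
Note (illustration only, not part of the diff): the new hook reports an EXTRACT_SUBVECTOR as cheap only when it takes the low or the high half of a wider vector (Index == 0 or Index == ResVT.getVectorNumElements()), since on NEON each half of a q-register is already addressable as a d-register. A minimal IR sketch of the two cheap cases, assuming a NEON-enabled ARM triple; the function names are made up for this example:

define <8 x i8> @extract_low_half(<16 x i8> %v) {
  ; Index 0: the low <8 x i8> half of the q-register.
  %lo = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %lo
}

define <8 x i8> @extract_high_half(<16 x i8> %v) {
  ; Index 8 (== ResVT.getVectorNumElements()): the high <8 x i8> half.
  %hi = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %hi
}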