Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
@@ -365,6 +365,11 @@
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                          Type *Ty) const override;
 
+  /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+  /// with this index.
+  bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+                               unsigned Index) const override;
+
   Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                         AtomicOrdering Ord) const override;
   Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8200,6 +8200,14 @@
   return Shift < 3;
 }
 
+bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+                                                    unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
 /// Turn vector tests of the signbit in the form of:
 /// xor (sra X, elt_size(X)-1), -1
 /// into:
Index: llvm/trunk/test/CodeGen/AArch64/aarch64-vuzp.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/aarch64-vuzp.ll
+++ llvm/trunk/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -4,7 +4,6 @@
 ; CHECK-LABEL: fun1:
 ; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NOT: mov
 define i32 @fun1() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> , <16 x i8> undef)
@@ -16,7 +15,6 @@
 ; CHECK-LABEL: fun2:
 ; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NOT: mov
 define i32 @fun2() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> , <16 x i8> undef)
@@ -28,7 +26,6 @@
 ; CHECK-LABEL: fun3:
 ; CHECK-NOT: uzp1
-; CHECK: mov
 define i32 @fun3() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> , <16 x i8> undef)
@@ -40,7 +37,6 @@
 ; CHECK-LABEL: fun4:
 ; CHECK-NOT: uzp2
-; CHECK: mov
 define i32 @fun4() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> , <16 x i8> undef)
Index: llvm/trunk/test/CodeGen/AArch64/arm64-ext.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-ext.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-ext.ll
@@ -94,19 +94,6 @@
 ; Tests for ReconstructShuffle function. Indices have to be carefully
 ; chosen to reach lowering phase as a BUILD_VECTOR.
-; One vector needs vext, the other can be handled by extract_subvector
-; Also checks interleaving of sources is handled correctly.
-; Essence: a vext is used on %A and something saner than stack load/store for final result.
-define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: test_interleaved:
-;CHECK: ext.8b
-;CHECK: zip1.4h
-  %tmp1 = load <8 x i16>, <8 x i16>* %A
-  %tmp2 = load <8 x i16>, <8 x i16>* %B
-  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> 
-  ret <4 x i16> %tmp3
-}
-
 ; An undef in the shuffle list should still be optimizable
 define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: test_undef:
Index: llvm/trunk/test/CodeGen/AArch64/neon-scalar-copy.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ llvm/trunk/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -79,7 +79,7 @@
 define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) #0 {
   ; CHECK-LABEL: test_vector_dup_bv16B:
-  ; CHECK-NEXT: dup v0.16b, v0.b[14]
+  ; CHECK-NEXT: dup v0.8b, v0.b[14]
   ; CHECK-NEXT: ret
   %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> 
   ret <1 x i8> %shuffle.i
@@ -95,7 +95,7 @@
 define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) #0 {
   ; CHECK-LABEL: test_vector_dup_hv8H:
-  ; CHECK-NEXT: dup v0.8h, v0.h[7]
+  ; CHECK-NEXT: dup v0.4h, v0.h[7]
   ; CHECK-NEXT: ret
   %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> 
   ret <1 x i16> %shuffle.i
@@ -111,7 +111,7 @@
 define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) #0 {
   ; CHECK-LABEL: test_vector_dup_sv4S:
-  ; CHECK-NEXT: dup v0.4s, v0.s[3]
+  ; CHECK-NEXT: dup v0.2s, v0.s[3]
   ; CHECK-NEXT: ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> 
   ret <1 x i32> %shuffle
@@ -135,7 +135,7 @@
 define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) #0 {
   ; CHECK-LABEL: test_vector_copy_dup_dv2D:
-  ; CHECK-NEXT: dup v0.2d, v1.d[1]
+  ; CHECK-NEXT: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
   ; CHECK-NEXT: ret
   %vget_lane = extractelement <2 x i64> %c, i32 1
   %vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0