Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp @@ -6053,6 +6053,9 @@ unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatUndef.isAllOnesValue()) + return DAG.getUNDEF(VT); + if (SplatBitSize <= 64) { // Check if an immediate VMOV works. EVT VmovVT; @@ -6214,6 +6217,24 @@ return shuffle; } + if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { + // If we haven't found an efficient lowering, try splitting a 128-bit vector + // into two 64-bit vectors; we might discover a better way to lower it. + SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); + EVT ExtVT = VT.getVectorElementType(); + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); + SDValue Lower = + DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); + if (Lower.getOpcode() == ISD::BUILD_VECTOR) + Lower = LowerBUILD_VECTOR(Lower, DAG, ST); + SDValue Upper = DAG.getBuildVector( + HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); + if (Upper.getOpcode() == ISD::BUILD_VECTOR) + Upper = LowerBUILD_VECTOR(Upper, DAG, ST); + if (Lower && Upper) + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); + } + // Vectors with 32- or 64-bit elements can be built by directly assigning // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands // will be legalized. 
Index: llvm/trunk/test/CodeGen/ARM/big-endian-vector-callee.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/big-endian-vector-callee.ll +++ llvm/trunk/test/CodeGen/ARM/big-endian-vector-callee.ll @@ -652,10 +652,10 @@ ; CHECK-LABEL: test_v2f64_f128: define <2 x double> @test_v2f64_f128(fp128 %p) { -; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 -; CHECK: vmov.32 [[REG1]][1], r1 ; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2 +; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 ; CHECK: vmov.32 [[REG2]][1], r3 +; CHECK: vmov.32 [[REG1]][1], r1 %1 = fadd fp128 %p, %p %2 = bitcast fp128 %1 to <2 x double> %3 = fadd <2 x double> %2, %2 @@ -747,10 +747,10 @@ ; CHECK-LABEL: test_v2i64_f128: define <2 x i64> @test_v2i64_f128(fp128 %p) { -; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 -; CHECK: vmov.32 [[REG1]][1], r1 ; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2 +; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 ; CHECK: vmov.32 [[REG2]][1], r3 +; CHECK: vmov.32 [[REG1]][1], r1 %1 = fadd fp128 %p, %p %2 = bitcast fp128 %1 to <2 x i64> %3 = add <2 x i64> %2, %2 @@ -827,10 +827,10 @@ ; CHECK-LABEL: test_v4f32_f128: define <4 x float> @test_v4f32_f128(fp128 %p) { -; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 -; CHECK: vmov.32 [[REG1]][1], r1 ; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2 +; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 ; CHECK: vmov.32 [[REG2]][1], r3 +; CHECK: vmov.32 [[REG1]][1], r1 %1 = fadd fp128 %p, %p %2 = bitcast fp128 %1 to <4 x float> %3 = fadd <4 x float> %2, %2 @@ -909,10 +909,10 @@ ; CHECK-LABEL: test_v4i32_f128: define <4 x i32> @test_v4i32_f128(fp128 %p) { -; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 -; CHECK: vmov.32 [[REG1]][1], r1 ; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2 +; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 ; CHECK: vmov.32 [[REG2]][1], r3 +; CHECK: vmov.32 [[REG1]][1], r1 %1 = fadd fp128 %p, %p %2 = bitcast fp128 %1 to <4 x i32> %3 = add <4 x i32> %2, %2 @@ -997,10 +997,10 @@ ; CHECK-LABEL: test_v8i16_f128: define <8 x i16> 
@test_v8i16_f128(fp128 %p) { -; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 -; CHECK: vmov.32 [[REG1]][1], r1 ; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2 +; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 ; CHECK: vmov.32 [[REG2]][1], r3 +; CHECK: vmov.32 [[REG1]][1], r1 %1 = fadd fp128 %p, %p %2 = bitcast fp128 %1 to <8 x i16> %3 = add <8 x i16> %2, %2 @@ -1085,10 +1085,10 @@ ; CHECK-LABEL: test_v16i8_f128: define <16 x i8> @test_v16i8_f128(fp128 %p) { -; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 -; CHECK: vmov.32 [[REG1]][1], r1 ; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2 +; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0 ; CHECK: vmov.32 [[REG2]][1], r3 +; CHECK: vmov.32 [[REG1]][1], r1 %1 = fadd fp128 %p, %p %2 = bitcast fp128 %1 to <16 x i8> %3 = add <16 x i8> %2, %2 Index: llvm/trunk/test/CodeGen/ARM/vcombine.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/vcombine.ll +++ llvm/trunk/test/CodeGen/ARM/vcombine.ll @@ -105,3 +105,21 @@ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> ret <8 x i8> %tmp2 } + +; vcombine(vld1_dup(p), vld1_dup(p2)) +define <8 x i16> @vcombine_vdup(<8 x i16> %src, i16* nocapture readonly %p) { +; CHECK-LABEL: vcombine_vdup: +; CHECK: vld1.16 {d16[]}, +; CHECK: vld1.16 {d17[]}, +; CHECK-LE: vmov r0, r1, d16 +; CHECK-LE: vmov r2, r3, d17 + %a1 = load i16, i16* %p, align 2 + %a2 = insertelement <4 x i16> undef, i16 %a1, i32 0 + %a3 = shufflevector <4 x i16> %a2, <4 x i16> undef, <4 x i32> zeroinitializer + %p2 = getelementptr inbounds i16, i16* %p, i32 1 + %b1 = load i16, i16* %p2, align 2 + %b2 = insertelement <4 x i16> undef, i16 %b1, i32 0 + %b3 = shufflevector <4 x i16> %b2, <4 x i16> undef, <4 x i32> zeroinitializer + %shuffle = shufflevector <4 x i16> %a3, <4 x i16> %b3, <8 x i32> + ret <8 x i16> %shuffle +} Index: llvm/trunk/test/CodeGen/ARM/vext.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/vext.ll +++ 
llvm/trunk/test/CodeGen/ARM/vext.ll @@ -164,16 +164,25 @@ ; The actual shuffle code only handles some cases, make sure we check ; this rather than blindly emitting a VECTOR_SHUFFLE (infinite ; lowering loop can result otherwise). +; (There are probably better ways to lower this shuffle, but it's not +; really important.) define <8 x i16> @test_illegal(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: test_illegal: -;CHECK: vmov.16 [[REG:d[0-9]+]][0] -;CHECK: vmov.16 [[REG]][1] -;CHECK: vmov.16 [[REG]][2] -;CHECK: vmov.16 [[REG]][3] -;CHECK: vmov.16 [[REG2:d[0-9]+]][0] -;CHECK: vmov.16 [[REG2]][1] -;CHECK: vmov.16 [[REG2]][2] -;CHECK: vmov.16 [[REG2]][3] +;CHECK: vmov.u16 +;CHECK-NEXT: vmov.u16 +;CHECK-NEXT: vorr +;CHECK-NEXT: vorr +;CHECK-NEXT: vmov.16 +;CHECK-NEXT: vuzp.16 +;CHECK-NEXT: vmov.u16 +;CHECK-NEXT: vmov.16 +;CHECK-NEXT: vuzp.16 +;CHECK-NEXT: vmov.16 +;CHECK-NEXT: vmov.u16 +;CHECK-NEXT: vext.16 +;CHECK-NEXT: vmov.16 +;CHECK-NEXT: vmov r0, r1, d +;CHECK-NEXT: vmov r2, r3, d %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> Index: llvm/trunk/test/CodeGen/ARM/vtrn.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/vtrn.ll +++ llvm/trunk/test/CodeGen/ARM/vtrn.ll @@ -372,13 +372,18 @@ ret <8 x i8> %rv } -; Negative test that should not generate a vtrn +; The shuffle mask is half a vtrn; we duplicate the half to produce the +; full result. 
define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) { entry: ; CHECK-LABEL: lower_twice_no_vtrn ; CHECK: @ BB#0: - ; CHECK-NOT: vtrn - ; CHECK: mov pc, lr + ; CHECK-NEXT: vldr d16, [r1] + ; CHECK-NEXT: vldr d18, [r0] + ; CHECK-NEXT: vtrn.16 d18, d16 + ; CHECK-NEXT: vorr d17, d16, d16 + ; CHECK-NEXT: vst1.64 {d16, d17}, [r2] + ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> @@ -386,13 +391,18 @@ ret void } -; Negative test that should not generate a vtrn +; The shuffle mask is half a vtrn; we duplicate the half to produce the +; full result. define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) { entry: ; CHECK-LABEL: upper_twice_no_vtrn ; CHECK: @ BB#0: - ; CHECK-NOT: vtrn - ; CHECK: mov pc, lr + ; CHECK-NEXT: vldr d16, [r1] + ; CHECK-NEXT: vldr d18, [r0] + ; CHECK-NEXT: vtrn.16 d18, d16 + ; CHECK-NEXT: vorr d19, d18, d18 + ; CHECK-NEXT: vst1.64 {d18, d19}, [r2] + ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>