Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp @@ -5715,6 +5715,44 @@ .getValue(WhichResult); } + // Also check for these shuffles through CONCAT_VECTORS: we canonicalize + // shuffles that produce a result larger than their operands with: + // shuffle(concat(v1, undef), concat(v2, undef)) + // -> + // shuffle(concat(v1, v2), undef) + // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). + // + // This is useful in the general case, but there are special cases where + // native shuffles produce larger results: the two-result ops. + // + // Look through the concat when lowering them: + // shuffle(concat(v1, v2), undef) + // -> + // concat(VZIP(v1, v2):0, :1) + // + if (V1->getOpcode() == ISD::CONCAT_VECTORS && + V2->getOpcode() == ISD::UNDEF) { + SDValue SubV1 = V1->getOperand(0); + SDValue SubV2 = V1->getOperand(1); + EVT SubVT = SubV1.getValueType(); + + // We expect these to have been canonicalized to -1. + assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) { + return i < (int)VT.getVectorNumElements(); + }) && "Unexpected shuffle index into UNDEF operand!"); + + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + SubV2 = SubV1; + assert((WhichResult == 0) && + "In-place shuffle of concat can only have one result!"); + SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), + SubV1, SubV2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), + Res.getValue(1)); + } + } } // If the shuffle is not directly supported and it has 4 elements, use Index: llvm/trunk/test/CodeGen/ARM/vtrn.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/vtrn.ll +++ llvm/trunk/test/CodeGen/ARM/vtrn.ll @@ -20,40 +20,9 @@ define <16 x i8> @vtrni8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; CHECK-LABEL: vtrni8_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d19, [r0] -; CHECK-NEXT: vldr d18, [r1] -; CHECK-NEXT: vmov.u8 r0, d19[0] -; CHECK-NEXT: vmov.8 d16[0], r0 -; CHECK-NEXT: vmov.u8 r0, d18[0] -; CHECK-NEXT: vmov.8 d16[1], r0 -; CHECK-NEXT: vmov.u8 r0, d19[2] -; CHECK-NEXT: vmov.8 d16[2], r0 -; CHECK-NEXT: vmov.u8 r0, d18[2] -; CHECK-NEXT: vmov.8 d16[3], r0 -; CHECK-NEXT: vmov.u8 r0, d19[4] -; CHECK-NEXT: vmov.8 d16[4], r0 -; CHECK-NEXT: vmov.u8 r0, d18[4] -; CHECK-NEXT: vmov.8 d16[5], r0 -; CHECK-NEXT: vmov.u8 r0, d19[6] -; CHECK-NEXT: vmov.8 d16[6], r0 -; CHECK-NEXT: vmov.u8 r0, d18[6] -; CHECK-NEXT: vmov.8 d16[7], r0 -; CHECK-NEXT: vmov.u8 r0, d19[1] -; CHECK-NEXT: vmov.8 d17[0], r0 -; CHECK-NEXT: vmov.u8 r0, d18[1] -; CHECK-NEXT: vmov.8 d17[1], r0 -; CHECK-NEXT: vmov.u8 r0, d19[3] -; CHECK-NEXT: vmov.8 d17[2], r0 -; CHECK-NEXT: vmov.u8 r0, d18[3] -; CHECK-NEXT: vmov.8 d17[3], r0 -; CHECK-NEXT: vmov.u8 r0, d19[5] -; CHECK-NEXT: vmov.8 d17[4], r0 -; CHECK-NEXT: vmov.u8 r0, d18[5] -; CHECK-NEXT: vmov.8 d17[5], r0 -; CHECK-NEXT: vmov.u8 r0, d19[7] -; CHECK-NEXT: vmov.8 d17[6], r0 -; CHECK-NEXT: vmov.u8 r0, d18[7] -; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vtrn.8 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -83,26 +52,11 @@ define <8 x i16> @vtrni16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: vtrni16_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vmov.u16 r0, d16[0] -; CHECK-NEXT: vmov.16 d18[0], r0 -; CHECK-NEXT: vmov.u16 r0, d17[0] -; CHECK-NEXT: vmov.16 d18[1], r0 -; CHECK-NEXT: vmov.u16 r0, d16[2] -; CHECK-NEXT: vmov.16 d18[2], r0 -; CHECK-NEXT: vmov.u16 r0, d17[2] -; CHECK-NEXT: vmov.16 d18[3], r0 -; CHECK-NEXT: vmov.u16 r0, d16[1] -; CHECK-NEXT: vmov.16 d19[0], r0 -; CHECK-NEXT: vmov.u16 r0, d17[1] -; CHECK-NEXT: vmov.16 d19[1], r0 -; CHECK-NEXT: vmov.u16 r0, d16[3] -; CHECK-NEXT: vmov.16 d19[2], r0 -; CHECK-NEXT: vmov.u16 r0, d17[3] -; CHECK-NEXT: vmov.16 d19[3], r0 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vtrn.16 d16, d17 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -132,8 +86,7 @@ ; CHECK: @ BB#0: ; CHECK-NEXT: vldr d17, [r1] ; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vrev64.32 q9, q8 -; CHECK-NEXT: vuzp.32 q8, q9 +; CHECK-NEXT: vtrn.32 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -165,8 +118,7 @@ ; CHECK: @ BB#0: ; CHECK-NEXT: vldr d17, [r1] ; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vrev64.32 q9, q8 -; CHECK-NEXT: vuzp.32 q8, q9 +; CHECK-NEXT: vtrn.32 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -329,32 +281,9 @@ define <16 x i8> @vtrni8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; CHECK-LABEL: vtrni8_undef_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d18, [r0] -; CHECK-NEXT: vldr d19, [r1] -; CHECK-NEXT: vmov.u8 r0, d18[0] -; CHECK-NEXT: vmov.8 d16[0], r0 -; CHECK-NEXT: vmov.u8 r0, d18[2] -; CHECK-NEXT: vmov.8 d16[2], r0 -; CHECK-NEXT: vmov.u8 r0, d19[2] -; CHECK-NEXT: vmov.8 d16[3], r0 -; CHECK-NEXT: vmov.u8 r0, d19[4] -; CHECK-NEXT: vmov.8 d16[5], r0 -; CHECK-NEXT: vmov.u8 r0, d18[6] -; CHECK-NEXT: vmov.8 d16[6], r0 -; CHECK-NEXT: vmov.u8 r0, d19[6] -; CHECK-NEXT: vmov.8 d16[7], r0 -; CHECK-NEXT: vmov.u8 r0, d18[1] -; CHECK-NEXT: vmov.8 d17[0], r0 -; CHECK-NEXT: vmov.u8 r0, d19[1] -; CHECK-NEXT: vmov.8 d17[1], r0 -; CHECK-NEXT: vmov.u8 r0, d18[3] -; CHECK-NEXT: vmov.8 d17[2], r0 -; CHECK-NEXT: vmov.u8 r0, d19[3] -; CHECK-NEXT: vmov.8 d17[3], r0 -; CHECK-NEXT: vmov.u8 r0, d18[5] -; CHECK-NEXT: vmov.8 d17[4], r0 -; CHECK-NEXT: vmov.u8 r0, d19[7] -; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vtrn.8 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr Index: llvm/trunk/test/CodeGen/ARM/vuzp.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/vuzp.ll +++ llvm/trunk/test/CodeGen/ARM/vuzp.ll @@ -20,40 +20,9 @@ define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; CHECK-LABEL: vuzpi8_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d19, [r0] -; CHECK-NEXT: vldr d18, [r1] -; CHECK-NEXT: vmov.u8 r0, d19[0] -; CHECK-NEXT: vmov.8 d16[0], r0 -; CHECK-NEXT: vmov.u8 r0, d19[2] -; CHECK-NEXT: vmov.8 d16[1], r0 -; CHECK-NEXT: vmov.u8 r0, d19[4] -; CHECK-NEXT: vmov.8 d16[2], r0 -; CHECK-NEXT: vmov.u8 r0, d19[6] -; CHECK-NEXT: vmov.8 d16[3], r0 -; CHECK-NEXT: vmov.u8 r0, d18[0] -; CHECK-NEXT: vmov.8 d16[4], r0 -; CHECK-NEXT: vmov.u8 r0, d18[2] -; CHECK-NEXT: vmov.8 d16[5], r0 -; CHECK-NEXT: vmov.u8 r0, d18[4] -; CHECK-NEXT: vmov.8 d16[6], r0 -; CHECK-NEXT: vmov.u8 r0, d18[6] -; CHECK-NEXT: vmov.8 d16[7], r0 -; CHECK-NEXT: vmov.u8 r0, d19[1] -; CHECK-NEXT: vmov.8 d17[0], r0 -; CHECK-NEXT: vmov.u8 r0, d19[3] -; CHECK-NEXT: vmov.8 d17[1], r0 -; CHECK-NEXT: vmov.u8 r0, d19[5] -; CHECK-NEXT: vmov.8 d17[2], r0 -; CHECK-NEXT: vmov.u8 r0, d19[7] -; CHECK-NEXT: vmov.8 d17[3], r0 -; CHECK-NEXT: vmov.u8 r0, d18[1] -; CHECK-NEXT: vmov.8 d17[4], r0 -; CHECK-NEXT: vmov.u8 r0, d18[3] -; CHECK-NEXT: vmov.8 d17[5], r0 -; CHECK-NEXT: vmov.u8 r0, d18[5] -; CHECK-NEXT: vmov.8 d17[6], r0 -; CHECK-NEXT: vmov.u8 r0, d18[7] -; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vuzp.8 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -83,26 +52,11 @@ define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: vuzpi16_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vmov.u16 r0, d16[0] -; CHECK-NEXT: vmov.16 d18[0], r0 -; CHECK-NEXT: vmov.u16 r0, d16[2] -; CHECK-NEXT: vmov.16 d18[1], r0 -; CHECK-NEXT: vmov.u16 r0, d17[0] -; CHECK-NEXT: vmov.16 d18[2], r0 -; CHECK-NEXT: vmov.u16 r0, d17[2] -; CHECK-NEXT: vmov.16 d18[3], r0 -; CHECK-NEXT: vmov.u16 r0, d16[1] -; CHECK-NEXT: vmov.16 d19[0], r0 -; CHECK-NEXT: vmov.u16 r0, d16[3] -; CHECK-NEXT: vmov.16 d19[1], r0 -; CHECK-NEXT: vmov.u16 r0, d17[1] -; CHECK-NEXT: vmov.16 d19[2], r0 -; CHECK-NEXT: vmov.u16 r0, d17[3] -; CHECK-NEXT: vmov.16 d19[3], r0 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vuzp.16 d16, d17 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -266,32 +220,9 @@ define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; CHECK-LABEL: vuzpi8_undef_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d18, [r0] -; CHECK-NEXT: vldr d19, [r1] -; CHECK-NEXT: vmov.u8 r0, d18[0] -; CHECK-NEXT: vmov.8 d16[0], r0 -; CHECK-NEXT: vmov.u8 r0, d18[2] -; CHECK-NEXT: vmov.8 d16[1], r0 -; CHECK-NEXT: vmov.u8 r0, d19[0] -; CHECK-NEXT: vmov.8 d16[4], r0 -; CHECK-NEXT: vmov.u8 r0, d19[2] -; CHECK-NEXT: vmov.8 d16[5], r0 -; CHECK-NEXT: vmov.u8 r0, d19[4] -; CHECK-NEXT: vmov.8 d16[6], r0 -; CHECK-NEXT: vmov.u8 r0, d19[6] -; CHECK-NEXT: vmov.8 d16[7], r0 -; CHECK-NEXT: vmov.u8 r0, d18[1] -; CHECK-NEXT: vmov.8 d17[0], r0 -; CHECK-NEXT: vmov.u8 r0, d18[3] -; CHECK-NEXT: vmov.8 d17[1], r0 -; CHECK-NEXT: vmov.u8 r0, d18[5] -; CHECK-NEXT: vmov.8 d17[2], r0 -; CHECK-NEXT: vmov.u8 r0, d18[7] -; CHECK-NEXT: vmov.8 d17[3], r0 -; CHECK-NEXT: vmov.u8 r0, d19[5] -; CHECK-NEXT: vmov.8 d17[6], r0 -; CHECK-NEXT: vmov.u8 r0, d19[7] -; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vuzp.8 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr Index: llvm/trunk/test/CodeGen/ARM/vzip.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/vzip.ll +++ llvm/trunk/test/CodeGen/ARM/vzip.ll @@ -20,40 +20,9 @@ define <16 x i8> @vzipi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; CHECK-LABEL: vzipi8_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d19, [r0] -; CHECK-NEXT: vldr d18, [r1] -; CHECK-NEXT: vmov.u8 r0, d19[0] -; CHECK-NEXT: vmov.8 d16[0], r0 -; CHECK-NEXT: vmov.u8 r0, d18[0] -; CHECK-NEXT: vmov.8 d16[1], r0 -; CHECK-NEXT: vmov.u8 r0, d19[1] -; CHECK-NEXT: vmov.8 d16[2], r0 -; CHECK-NEXT: vmov.u8 r0, d18[1] -; CHECK-NEXT: vmov.8 d16[3], r0 -; CHECK-NEXT: vmov.u8 r0, d19[2] -; CHECK-NEXT: vmov.8 d16[4], r0 -; CHECK-NEXT: vmov.u8 r0, d18[2] -; CHECK-NEXT: vmov.8 d16[5], r0 -; CHECK-NEXT: vmov.u8 r0, d19[3] -; CHECK-NEXT: vmov.8 d16[6], r0 -; CHECK-NEXT: vmov.u8 r0, d18[3] -; CHECK-NEXT: vmov.8 d16[7], r0 -; CHECK-NEXT: vmov.u8 r0, d19[4] -; CHECK-NEXT: vmov.8 d17[0], r0 -; CHECK-NEXT: vmov.u8 r0, d18[4] -; CHECK-NEXT: vmov.8 d17[1], r0 -; CHECK-NEXT: vmov.u8 r0, d19[5] -; CHECK-NEXT: vmov.8 d17[2], r0 -; CHECK-NEXT: vmov.u8 r0, d18[5] -; CHECK-NEXT: vmov.8 d17[3], r0 -; CHECK-NEXT: vmov.u8 r0, d19[6] -; CHECK-NEXT: vmov.8 d17[4], r0 -; CHECK-NEXT: vmov.u8 r0, d18[6] -; CHECK-NEXT: vmov.8 d17[5], r0 -; CHECK-NEXT: vmov.u8 r0, d19[7] -; CHECK-NEXT: vmov.8 d17[6], r0 -; CHECK-NEXT: vmov.u8 r0, d18[7] -; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vzip.8 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -83,26 +52,11 @@ define <8 x i16> @vzipi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: vzipi16_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vmov.u16 r0, d16[0] -; CHECK-NEXT: vmov.16 d18[0], r0 -; CHECK-NEXT: vmov.u16 r0, d17[0] -; CHECK-NEXT: vmov.16 d18[1], r0 -; CHECK-NEXT: vmov.u16 r0, d16[1] -; CHECK-NEXT: vmov.16 d18[2], r0 -; CHECK-NEXT: vmov.u16 r0, d17[1] -; CHECK-NEXT: vmov.16 d18[3], r0 -; CHECK-NEXT: vmov.u16 r0, d16[2] -; CHECK-NEXT: vmov.16 d19[0], r0 -; CHECK-NEXT: vmov.u16 r0, d17[2] -; CHECK-NEXT: vmov.16 d19[1], r0 -; CHECK-NEXT: vmov.u16 r0, d16[3] -; CHECK-NEXT: vmov.16 d19[2], r0 -; CHECK-NEXT: vmov.u16 r0, d17[3] -; CHECK-NEXT: vmov.16 d19[3], r0 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vzip.16 d16, d17 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -266,32 +220,9 @@ define <16 x i8> @vzipi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; CHECK-LABEL: vzipi8_undef_Qres: ; CHECK: @ BB#0: -; CHECK-NEXT: vldr d18, [r0] -; CHECK-NEXT: vldr d19, [r1] -; CHECK-NEXT: vmov.u8 r0, d18[0] -; CHECK-NEXT: vmov.8 d16[0], r0 -; CHECK-NEXT: vmov.u8 r0, d18[1] -; CHECK-NEXT: vmov.8 d16[2], r0 -; CHECK-NEXT: vmov.u8 r0, d19[1] -; CHECK-NEXT: vmov.8 d16[3], r0 -; CHECK-NEXT: vmov.u8 r0, d19[2] -; CHECK-NEXT: vmov.8 d16[5], r0 -; CHECK-NEXT: vmov.u8 r0, d18[3] -; CHECK-NEXT: vmov.8 d16[6], r0 -; CHECK-NEXT: vmov.u8 r0, d19[3] -; CHECK-NEXT: vmov.8 d16[7], r0 -; CHECK-NEXT: vmov.u8 r0, d18[4] -; CHECK-NEXT: vmov.8 d17[0], r0 -; CHECK-NEXT: vmov.u8 r0, d19[4] -; CHECK-NEXT: vmov.8 d17[1], r0 -; CHECK-NEXT: vmov.u8 r0, d18[5] -; CHECK-NEXT: vmov.8 d17[2], r0 -; CHECK-NEXT: vmov.u8 r0, d19[5] -; CHECK-NEXT: vmov.8 d17[3], r0 -; CHECK-NEXT: vmov.u8 r0, d18[6] -; CHECK-NEXT: vmov.8 d17[4], r0 -; CHECK-NEXT: vmov.u8 r0, d19[7] -; CHECK-NEXT: vmov.8 d17[7], r0 +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vzip.8 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr