Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -5714,6 +5714,45 @@
       return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
           .getValue(WhichResult);
     }
+
+    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
+    // shuffles that produce a result larger than their operands with:
+    //   shuffle(concat(v1, undef), concat(v2, undef))
+    // ->
+    //   shuffle(concat(v1, v2), undef)
+    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
+    //
+    // This is useful in the general case, but there are special cases where
+    // native shuffles produce larger results: the two-result ops.
+    //
+    // Look through the concat when lowering them:
+    //   shuffle(concat(v1, v2), undef)
+    // ->
+    //   concat(VZIP(v1, v2):0, :1)
+    //
+    if (V1->getOpcode() == ISD::CONCAT_VECTORS &&
+        V2->getOpcode() == ISD::UNDEF) {
+      SDValue SubV1 = V1->getOperand(0);
+      SDValue SubV2 = V1->getOperand(1);
+      EVT SubVT = SubV1.getValueType();
+
+      // We expect these to have been canonicalized to -1.
+      assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) {
+               return i < (int)VT.getVectorNumElements();
+             }) && "Unexpected shuffle index into UNDEF operand!");
+
+      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
+              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
+        if (isV_UNDEF)
+          SubV2 = SubV1;
+        assert((WhichResult == 0) &&
+               "In-place shuffle of concat can only have one result!");
+        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
+                                  SubV1, SubV2);
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
+                           Res.getValue(1));
+      }
+    }
   }
 
   // If the shuffle is not directly supported and it has 4 elements, use
Index: test/CodeGen/ARM/vtrn.ll
===================================================================
--- test/CodeGen/ARM/vtrn.ll
+++ test/CodeGen/ARM/vtrn.ll
@@ -12,6 +12,15 @@
   ret <8 x i8> %tmp5
 }
 
+define <16 x i8> @vtrni8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrni8_Qres:
+;CHECK: vtrn.8
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <16 x i8> %tmp3
+}
+
 define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK-LABEL: vtrni16:
 ;CHECK: vtrn.16
@@ -24,6 +33,15 @@
   ret <4 x i16> %tmp5
 }
 
+define <8 x i16> @vtrni16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrni16_Qres:
+;CHECK: vtrn.16
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
+  ret <8 x i16> %tmp3
+}
+
 define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ;CHECK-LABEL: vtrni32:
 ;CHECK: vtrn.32
@@ -36,6 +54,15 @@
   ret <2 x i32> %tmp5
 }
 
+define <4 x i32> @vtrni32_Qres(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vtrni32_Qres:
+;CHECK: vtrn.32
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = load <2 x i32>, <2 x i32>* %B
+  %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x i32> %tmp3
+}
+
 define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
 ;CHECK-LABEL: vtrnf:
 ;CHECK: vtrn.32
@@ -48,6 +75,15 @@
   ret <2 x float> %tmp5
 }
 
+define <4 x float> @vtrnf_Qres(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: vtrnf_Qres:
+;CHECK: vtrn.32
+  %tmp1 = load <2 x float>, <2 x float>* %A
+  %tmp2 = load <2 x float>, <2 x float>* %B
+  %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %tmp3
+}
+
 define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: vtrnQi8:
 ;CHECK: vtrn.8
@@ -60,6 +96,15 @@
   ret <16 x i8> %tmp5
 }
 
+define <32 x i8> @vtrnQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrnQi8_QQres:
+;CHECK: vtrn.8
+  %tmp1 = load <16 x i8>, <16 x i8>* %A
+  %tmp2 = load <16 x i8>, <16 x i8>* %B
+  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30, i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+  ret <32 x i8> %tmp3
+}
+
 define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: vtrnQi16:
 ;CHECK: vtrn.16
@@ -72,6 +117,15 @@
   ret <8 x i16> %tmp5
 }
 
+define <16 x i16> @vtrnQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrnQi16_QQres:
+;CHECK: vtrn.16
+  %tmp1 = load <8 x i16>, <8 x i16>* %A
+  %tmp2 = load <8 x i16>, <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <16 x i16> %tmp3
+}
+
 define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: vtrnQi32:
 ;CHECK: vtrn.32
@@ -84,6 +138,15 @@
   ret <4 x i32> %tmp5
 }
 
+define <8 x i32> @vtrnQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vtrnQi32_QQres:
+;CHECK: vtrn.32
+  %tmp1 = load <4 x i32>, <4 x i32>* %A
+  %tmp2 = load <4 x i32>, <4 x i32>* %B
+  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
+  ret <8 x i32> %tmp3
+}
+
 define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
 ;CHECK-LABEL: vtrnQf:
 ;CHECK: vtrn.32
@@ -96,6 +159,15 @@
   ret <4 x float> %tmp5
 }
 
+define <8 x float> @vtrnQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vtrnQf_QQres:
+;CHECK: vtrn.32
+  %tmp1 = load <4 x float>, <4 x float>* %A
+  %tmp2 = load <4 x float>, <4 x float>* %B
+  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
+  ret <8 x float> %tmp3
+}
+
 ; Undef shuffle indices should not prevent matching to VTRN:
 
 define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -110,6 +182,15 @@
   ret <8 x i8> %tmp5
 }
 
+define <16 x i8> @vtrni8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrni8_undef_Qres:
+;CHECK: vtrn.8
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %tmp3
+}
+
 define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: vtrnQi16_undef:
 ;CHECK: vtrn.16
@@ -122,3 +203,11 @@
   ret <8 x i16> %tmp5
 }
 
+define <16 x i16> @vtrnQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrnQi16_undef_QQres:
+;CHECK: vtrn.16
+  %tmp1 = load <8 x i16>, <8 x i16>* %A
+  %tmp2 = load <8 x i16>, <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14, i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
+  ret <16 x i16> %tmp3
+}
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -12,6 +12,15 @@
   ret <8 x i8> %tmp5
 }
 
+define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpi8_Qres:
+;CHECK: vuzp.8
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <16 x i8> %tmp3
+}
+
 define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK-LABEL: vuzpi16:
 ;CHECK: vuzp.16
@@ -24,6 +33,15 @@
   ret <4 x i16> %tmp5
 }
 
+define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpi16_Qres:
+;CHECK: vuzp.16
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  ret <8 x i16> %tmp3
+}
+
 ; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
 
 define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
@@ -38,6 +56,15 @@
   ret <16 x i8> %tmp5
 }
 
+define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpQi8_QQres:
+;CHECK: vuzp.8
+  %tmp1 = load <16 x i8>, <16 x i8>* %A
+  %tmp2 = load <16 x i8>, <16 x i8>* %B
+  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  ret <32 x i8> %tmp3
+}
+
 define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: vuzpQi16:
 ;CHECK: vuzp.16
@@ -50,6 +77,15 @@
   ret <8 x i16> %tmp5
 }
 
+define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpQi16_QQres:
+;CHECK: vuzp.16
+  %tmp1 = load <8 x i16>, <8 x i16>* %A
+  %tmp2 = load <8 x i16>, <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <16 x i16> %tmp3
+}
+
 define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: vuzpQi32:
 ;CHECK: vuzp.32
@@ -62,6 +98,15 @@
   ret <4 x i32> %tmp5
 }
 
+define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vuzpQi32_QQres:
+;CHECK: vuzp.32
+  %tmp1 = load <4 x i32>, <4 x i32>* %A
+  %tmp2 = load <4 x i32>, <4 x i32>* %B
+  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  ret <8 x i32> %tmp3
+}
+
 define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
 ;CHECK-LABEL: vuzpQf:
 ;CHECK: vuzp.32
@@ -74,6 +119,15 @@
   ret <4 x float> %tmp5
 }
 
+define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vuzpQf_QQres:
+;CHECK: vuzp.32
+  %tmp1 = load <4 x float>, <4 x float>* %A
+  %tmp2 = load <4 x float>, <4 x float>* %B
+  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  ret <8 x float> %tmp3
+}
+
 ; Undef shuffle indices should not prevent matching to VUZP:
 
 define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -88,6 +142,15 @@
   ret <8 x i8> %tmp5
 }
 
+define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpi8_undef_Qres:
+;CHECK: vuzp.8
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
+  ret <16 x i8> %tmp3
+}
+
 define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: vuzpQi16_undef:
 ;CHECK: vuzp.16
@@ -100,3 +163,11 @@
   ret <8 x i16> %tmp5
 }
 
+define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpQi16_undef_QQres:
+;CHECK: vuzp.16
+  %tmp1 = load <8 x i16>, <8 x i16>* %A
+  %tmp2 = load <8 x i16>, <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 11, i32 13, i32 undef>
+  ret <16 x i16> %tmp3
+}
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -12,6 +12,15 @@
   ret <8 x i8> %tmp5
 }
 
+define <16 x i8> @vzipi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipi8_Qres:
+;CHECK: vzip.8
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x i8> %tmp3
+}
+
 define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK-LABEL: vzipi16:
 ;CHECK: vzip.16
@@ -24,6 +33,15 @@
   ret <4 x i16> %tmp5
 }
 
+define <8 x i16> @vzipi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vzipi16_Qres:
+;CHECK: vzip.16
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x i16> %tmp3
+}
+
 ; VZIP.32 is equivalent to VTRN.32 for 64-bit vectors.
 
 define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
@@ -38,6 +56,15 @@
   ret <16 x i8> %tmp5
 }
 
+define <32 x i8> @vzipQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipQi8_QQres:
+;CHECK: vzip.8
+  %tmp1 = load <16 x i8>, <16 x i8>* %A
+  %tmp2 = load <16 x i8>, <16 x i8>* %B
+  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <32 x i8> %tmp3
+}
+
 define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: vzipQi16:
 ;CHECK: vzip.16
@@ -50,6 +77,15 @@
   ret <8 x i16> %tmp5
 }
 
+define <16 x i16> @vzipQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vzipQi16_QQres:
+;CHECK: vzip.16
+  %tmp1 = load <8 x i16>, <8 x i16>* %A
+  %tmp2 = load <8 x i16>, <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x i16> %tmp3
+}
+
 define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: vzipQi32:
 ;CHECK: vzip.32
@@ -62,6 +98,15 @@
   ret <4 x i32> %tmp5
 }
 
+define <8 x i32> @vzipQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vzipQi32_QQres:
+;CHECK: vzip.32
+  %tmp1 = load <4 x i32>, <4 x i32>* %A
+  %tmp2 = load <4 x i32>, <4 x i32>* %B
+  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x i32> %tmp3
+}
+
 define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind {
 ;CHECK-LABEL: vzipQf:
 ;CHECK: vzip.32
@@ -74,6 +119,15 @@
   ret <4 x float> %tmp5
 }
 
+define <8 x float> @vzipQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vzipQf_QQres:
+;CHECK: vzip.32
+  %tmp1 = load <4 x float>, <4 x float>* %A
+  %tmp2 = load <4 x float>, <4 x float>* %B
+  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x float> %tmp3
+}
+
 ; Undef shuffle indices should not prevent matching to VZIP:
 
 define <8 x i8> @vzipi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -88,6 +142,15 @@
   ret <8 x i8> %tmp5
 }
 
+define <16 x i8> @vzipi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipi8_undef_Qres:
+;CHECK: vzip.8
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 undef, i32 1, i32 9, i32 undef, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %tmp3
+}
+
 define <16 x i8> @vzipQi8_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: vzipQi8_undef:
 ;CHECK: vzip.8
@@ -100,3 +163,11 @@
   ret <16 x i8> %tmp5
 }
 
+define <32 x i8> @vzipQi8_undef_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipQi8_undef_QQres:
+;CHECK: vzip.8
+  %tmp1 = load <16 x i8>, <16 x i8>* %A
+  %tmp2 = load <16 x i8>, <16 x i8>* %B
+  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 1, i32 undef, i32 undef, i32 undef, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 undef, i32 10, i32 26, i32 11, i32 27, i32 undef, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <32 x i8> %tmp3
+}
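
Reviewer note (illustration only, not part of the patch): the masks in the new *_Qres and *_QQres tests are the "result 0 followed by result 1" patterns of the two-result NEON ops, which is the shape the added look-through lowering recognizes before emitting a single VTRN/VUZP/VZIP plus a CONCAT_VECTORS. The standalone C++ sketch below enumerates those mask shapes; the helper name concatenatedTwoResultMask is made up for this note and does not exist in the tree. Undef (-1) entries are also tolerated by the in-tree matcher, as the *_undef_* tests exercise.

#include <cstdio>
#include <initializer_list>
#include <vector>

// Build "result0 ++ result1" for a two-result NEON shuffle of two vectors
// that each have NumElts elements. Kind: 't' = vtrn, 'u' = vuzp, 'z' = vzip.
static std::vector<int> concatenatedTwoResultMask(char Kind, int NumElts) {
  std::vector<int> Res0, Res1;
  switch (Kind) {
  case 't': // vtrn: (A[2i], B[2i]) pairs, then (A[2i+1], B[2i+1]) pairs.
    for (int I = 0; I < NumElts; I += 2) {
      Res0.push_back(I);
      Res0.push_back(NumElts + I);
      Res1.push_back(I + 1);
      Res1.push_back(NumElts + I + 1);
    }
    break;
  case 'u': // vuzp: even elements of A++B, then odd elements of A++B.
    for (int I = 0; I < 2 * NumElts; I += 2) {
      Res0.push_back(I);
      Res1.push_back(I + 1);
    }
    break;
  case 'z': // vzip: interleave the low halves, then the high halves.
    for (int I = 0; I < NumElts / 2; ++I) {
      Res0.push_back(I);
      Res0.push_back(NumElts + I);
      Res1.push_back(NumElts / 2 + I);
      Res1.push_back(NumElts + NumElts / 2 + I);
    }
    break;
  }
  // Concatenate the two result masks, matching concat(OP(v1, v2):0, :1).
  Res0.insert(Res0.end(), Res1.begin(), Res1.end());
  return Res0;
}

int main() {
  // For NumElts = 8 this prints the index sequences used by vtrni8_Qres,
  // vuzpi8_Qres, and vzipi8_Qres above.
  for (char Kind : {'t', 'u', 'z'}) {
    std::printf("%c:", Kind);
    for (int Idx : concatenatedTwoResultMask(Kind, 8))
      std::printf(" %d", Idx);
    std::printf("\n");
  }
  return 0;
}

For example, the 't' line prints 0 8 2 10 4 12 6 14 1 9 3 11 5 13 7 15, the vtrn.8 mask checked in vtrni8_Qres.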