Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7562,7 +7562,7 @@
             isVTBLMask(M, VT) ||
             isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
     return true;
-  else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
+  else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
            isReverseMask(M, VT))
     return true;
   else if (Subtarget->hasMVEIntegerOps() &&
@@ -7653,21 +7653,21 @@
                       DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
 }
 
-static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
-                                                      SelectionDAG &DAG) {
+static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
-  SDValue OpLHS = Op.getOperand(0);
-  EVT VT = OpLHS.getValueType();
+  EVT VT = Op.getValueType();
 
-  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
-         "Expect an v8i16/v16i8 type");
-  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
-  // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
+  SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
+  // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
   // extract the first 8 bytes into the top double word and the last 8 bytes
-  // into the bottom double word. The v8i16 case is similar.
-  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
-  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
-                     DAG.getConstant(ExtractNum, DL, MVT::i32));
+  // into the bottom double word, through a new vector shuffle that will be
+  // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
+  std::vector<int> NewMask;
+  for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
+    NewMask.push_back(VT.getVectorNumElements() / 2 + i);
+  for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
+    NewMask.push_back(i);
+  return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
 }
 
 static EVT getVectorTyFromPredicateVector(EVT VT) {
@@ -8034,8 +8034,9 @@
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
-  if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
-    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
+  if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
+      isReverseMask(ShuffleMask, VT))
+    return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
 
   if (ST->hasNEON() && VT == MVT::v8i8)
     if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -633,6 +633,22 @@
                                               LT.second))
         return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
     }
+    if (Kind == TTI::SK_Reverse) {
+      static const CostTblEntry MVERevTbl[] = {
+          // FIXME: These should take 3 instructions, a VREV64 and 2 VMOVD. We
+          // currently generate 4 VMOVS's
+          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 5},
+          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 5},
+          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 5},
+          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 5},
+          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 5}};
+
+      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+      if (const auto *Entry =
+              CostTableLookup(MVERevTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
+    }
   }
   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                      ? ST->getMVEVectorCostFactor()
Index: llvm/test/Analysis/CostModel/ARM/shuffle.ll
===================================================================
--- llvm/test/Analysis/CostModel/ARM/shuffle.ll
+++ llvm/test/Analysis/CostModel/ARM/shuffle.ll
@@ -56,17 +56,17 @@
 define void @reverse() {
 ; CHECK-MVE-LABEL: 'reverse'
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v18 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v18 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-NEON-LABEL: 'reverse'
Index: llvm/test/CodeGen/Thumb2/mve-shuffle.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -62,23 +62,11 @@
 define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
 ; CHECK-LABEL: shuffle1_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vrev64.16 q1, q0
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s1, s7
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -141,39 +129,11 @@
 define arm_aapcs_vfpcc <16 x i8> @shuffle1_i8(<16 x i8> %src) {
 ; CHECK-LABEL: shuffle1_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vrev64.8 q1, q0
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s1, s7
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -345,27 +305,11 @@
 define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {
 ; CHECK-LABEL: shuffle1_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vrev64.16 q1, q0
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s1, s7
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
Index: llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
+++ llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
@@ -35,23 +35,11 @@
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_76543210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vrev64.16 q1, q0
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s1, s7
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -188,39 +176,11 @@
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_fedcba9876543210(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_fedcba9876543210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vrev64.8 q1, q0
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s1, s7
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -346,27 +306,11 @@
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_76543210(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_76543210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vrev64.16 q1, q0
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s1, s7
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32>