Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -210,6 +210,7 @@
     // MVE float <> half converts
     VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes
+    VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes
 
     // Vector multiply long:
     VMULLs, // ...signed
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1677,6 +1677,7 @@
   case ARMISD::VQMOVNs:       return "ARMISD::VQMOVNs";
   case ARMISD::VQMOVNu:       return "ARMISD::VQMOVNu";
   case ARMISD::VCVTN:         return "ARMISD::VCVTN";
+  case ARMISD::VCVTL:         return "ARMISD::VCVTL";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
   case ARMISD::VADDVs:        return "ARMISD::VADDVs";
@@ -7177,6 +7178,50 @@
                      DAG.getConstant(1, dl, MVT::i32));
 }
 
+// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
+// from a single input on alternating lanes. For example:
+//  BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
+//              FP_EXTEND(EXTRACT_ELT(X, 2),
+//              FP_EXTEND(EXTRACT_ELT(X, 4), ...)
+static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
+                                       const ARMSubtarget *ST) {
+  assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  if (!ST->hasMVEFloatOps())
+    return SDValue();
+
+  SDLoc dl(BV);
+  EVT VT = BV.getValueType();
+  if (VT != MVT::v4f32)
+    return SDValue();
+
+  // We are looking for a buildvector of fpext elements, where all the
+  // elements are alternating lanes from a single source. For example <0,2,4,6>
+  // or <1,3,5,7>. Check that the first item is valid enough and extract some
+  // info from it (it is checked properly in the loop below).
+  if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
+      BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
+  int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
+  if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
+    return SDValue();
+
+  // Check all the values in the BuildVector line up with our expectations.
+  for (unsigned i = 1; i < 4; i++) {
+    auto Check = [](SDValue Ext, SDValue Op, unsigned Idx) {
+      return Ext.getOpcode() == ISD::FP_EXTEND &&
+             Ext.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+             Ext.getOperand(0).getOperand(0) == Op &&
+             Ext.getOperand(0).getConstantOperandVal(1) == Idx;
+    };
+    if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
+      return SDValue();
+  }
+
+  return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
+                     DAG.getConstant(Offset, dl, MVT::i32));
+}
+
 // If N is an integer constant that can be moved into a register in one
 // instruction, return an SDValue of such a constant (will become a MOV
 // instruction). Otherwise return null.
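For context, the BUILD_VECTOR-of-fpext DAG that LowerBuildVectorOfFPExt matches is typically produced by a strided shufflevector feeding an fpext, which gets scalarized into per-lane extends during lowering. A minimal IR sketch (the function and value names here are illustrative, not taken from the patch):

    ; Even lanes <0,2,4,6> give Offset == 0, selecting VCVTB.F32.F16;
    ; odd lanes <1,3,5,7> give Offset == 1, selecting VCVTT.F32.F16.
    define arm_aapcs_vfpcc <4 x float> @bottom_lane_extend(<8 x half> %x) {
    entry:
      %strided.vec = shufflevector <8 x half> %x, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
      %out = fpext <4 x half> %strided.vec to <4 x float>
      ret <4 x float> %out
    }
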
@@ -7439,9 +7484,12 @@
   if (SDValue shuffle = ReconstructShuffle(Op, DAG))
     return shuffle;
 
-  // Attempt to turn a buildvector of scalar fptrunc's back into VCVT's
+  // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
+  // VCVT's
   if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
     return VCVT;
+  if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
+    return VCVT;
 
   if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 &&
       VT != MVT::v4f32) {
     // If we haven't found an efficient lowering, try splitting a 128-bit vector
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4772,7 +4772,10 @@
   let retainsPreviousHalfElement = 1;
 }
 
+def SDTARMVCVTL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                       SDTCisVT<2, i32>]>;
 def MVEvcvtn : SDNode<"ARMISD::VCVTN", SDTARMVMOVNQ>;
+def MVEvcvtl : SDNode<"ARMISD::VCVTL", SDTARMVCVTL>;
 
 multiclass MVE_VCVT_h2f_m<string iname, int half> {
   def "": MVE_VCVT_ff;
+
+  def : Pat<(v4f32 (MVEvcvtl (v8f16 MQPR:$Qm), (i32 half))),
+            (v4f32 (Inst (v8f16 MQPR:$Qm)))>;
   }
 }
Index: llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
+++ llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
@@ -556,8 +556,6 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    adr r2, .LCPI9_0
 ; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
@@ -565,26 +563,15 @@
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
-; CHECK-NEXT:    vmovx.f16 s8, s7
-; CHECK-NEXT:    vmovx.f16 s14, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s11, s8
-; CHECK-NEXT:    vmovx.f16 s13, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s10, s14
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s9, s13
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s8, s12
-; CHECK-NEXT:    vmul.f32 q1, q4, q0
+; CHECK-NEXT:    vcvtb.f32.f16 q2, q1
+; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
 ; CHECK-NEXT:    vmul.f32 q2, q2, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
-; CHECK-NEXT:    vcvtt.f16.f32 q1, q2
-; CHECK-NEXT:    vstrb.8 q1, [r1], #16
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtt.f16.f32 q2, q1
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB9_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
@@ -625,8 +612,6 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    adr r2, .LCPI10_0
 ; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
@@ -634,44 +619,23 @@
 ; CHECK-NEXT:  .LBB10_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s7
-; CHECK-NEXT:    vmovx.f16 s14, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s11, s8
-; CHECK-NEXT:    vmovx.f16 s13, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s10, s14
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s9, s13
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s8, s12
-; CHECK-NEXT:    vmul.f32 q1, q4, q0
+; CHECK-NEXT:    vcvtb.f32.f16 q2, q1
+; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
 ; CHECK-NEXT:    vmul.f32 q2, q2, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
-; CHECK-NEXT:    vcvtt.f16.f32 q1, q2
-; CHECK-NEXT:    vstrh.16 q1, [r1]
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtt.f16.f32 q2, q1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]!
-; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vmovx.f16 s14, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s12
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s14
-; CHECK-NEXT:    vmovx.f16 s10, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s11, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s10, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s9, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s8, s4
-; CHECK-NEXT:    vmul.f32 q1, q4, q0
+; CHECK-NEXT:    vstrh.16 q2, [r1]
+; CHECK-NEXT:    vcvtb.f32.f16 q2, q1
+; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
 ; CHECK-NEXT:    vmul.f32 q2, q2, q0
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
 ; CHECK-NEXT:    vcvtt.f16.f32 q2, q1
 ; CHECK-NEXT:    vstrb.8 q2, [r1, #16]!
 ; CHECK-NEXT:    le lr, .LBB10_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
Index: llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
+++ llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
@@ -196,11 +196,7 @@
 define arm_aapcs_vfpcc <4 x float> @fpext_0246(<8 x half> %src) {
 ; CHECK-LABEL: fpext_0246:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtb.f32.f16 s7, s3
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s2
-; CHECK-NEXT:    vcvtb.f32.f16 s5, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x half> %src, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -211,14 +207,7 @@
 define arm_aapcs_vfpcc <4 x float> @fpext_1357(<8 x half> %src) {
 ; CHECK-LABEL: fpext_1357:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s8, s3
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vmovx.f16 s10, s2
-; CHECK-NEXT:    vcvtb.f32.f16 s3, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s2, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s1, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s6
+; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x half> %src, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -229,16 +218,8 @@
 define arm_aapcs_vfpcc <8 x float> @fpext_02468101214(<16 x half> %src) {
 ; CHECK-LABEL: fpext_02468101214:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtb.f32.f16 s11, s3
-; CHECK-NEXT:    vcvtb.f32.f16 s10, s2
-; CHECK-NEXT:    vcvtb.f32.f16 s9, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s6
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-NEXT:    vmov q1, q3
+; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -249,22 +230,8 @@
 define arm_aapcs_vfpcc <8 x float> @fpext_13579111315(<16 x half> %src) {
 ; CHECK-LABEL: fpext_13579111315:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s14, s6
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vmovx.f16 s5, s3
-; CHECK-NEXT:    vmovx.f16 s10, s4
-; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vmovx.f16 s7, s2
-; CHECK-NEXT:    vcvtb.f32.f16 s3, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s2, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s1, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s7, s12
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s14
-; CHECK-NEXT:    vcvtb.f32.f16 s5, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s10
+; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
+; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
Index: llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
+++ llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
@@ -298,11 +298,8 @@
 define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8(<8 x half>* %src) {
 ; CHECK-LABEL: load_shuffleext_8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vcvtb.f32.f16 s3, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s2, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s1, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s4
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.load = load <8 x half>, <8 x half>* %src, align 4
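For comparison, a top-lane variant of the same load pattern (a hypothetical test sketch, not part of mve-vcvt16.ll) would take the Offset == 1 path and select the VCVTT form instead:

    define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8_top(<8 x half>* %src) {
    entry:
      %wide.load = load <8 x half>, <8 x half>* %src, align 4
      ; Odd lanes: expected codegen is a vldrw followed by "vcvtt.f32.f16 q0, q0".
      %strided.vec = shufflevector <8 x half> %wide.load, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
      %out = fpext <4 x half> %strided.vec to <4 x float>
      ret <4 x float> %out
    }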