diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -919,6 +919,8 @@
   setTargetDAGCombine(ISD::MUL);
 
+  setTargetDAGCombine(ISD::FP_EXTEND);
+
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::VSELECT);
 
@@ -15260,6 +15262,99 @@
   return SDValue();
 }
 
+static SDValue performFpExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                      const AArch64Subtarget *Subtarget) {
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (!VT.isFixedLengthVector())
+    return SDValue();
+
+  if (DAG.getTargetLoweringInfo().isTypeLegal(VT) ||
+      !Subtarget->useSVEForFixedLengthVectors())
+    return SDValue();
+
+  // In cases where the result of the FP_EXTEND is not legal, it will be
+  // expanded into multiple extract_subvectors which cannot be lowered without
+  // going through memory.
+  //
+  // If we push an extend into the load feeding the FP_EXTEND, we can force the
+  // load to be expanded into the same number of parts as the FP_EXTEND,
+  // avoiding the need for extract_subvectors completely.
+  //
+  // As part of the lowering of FP_EXTEND for fixed-length types, uunpklo nodes
+  // will be introduced, which will then combine with the truncate introduced
+  // after the load.
+  if (ISD::isNormalLoad(Op.getNode())) {
+    LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+
+    // Check if there are other uses. If so, do not combine as it will
+    // introduce an extra load.
+    for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
+         UI != UE; ++UI) {
+      if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+        continue;
+      if (*UI != N)
+        return SDValue();
+    }
+
+    SDValue NewLoad = DAG.getExtLoad(
+        ISD::ZEXTLOAD, DL, VT.changeTypeToInteger(), LD->getChain(),
+        LD->getBasePtr(), LD->getMemoryVT().changeTypeToInteger(),
+        LD->getMemOperand());
+
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+
+    SDValue Trunc = DAG.getNode(
+        ISD::TRUNCATE, DL, Op->getValueType(0).changeTypeToInteger(), NewLoad);
+    SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Trunc);
+
+    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Bitcast);
+  }
+
+  return SDValue();
+}
+
+static SDValue performUunpkloCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // uunpklo(uzp1(x, x)) where x = bitcast(zextload) -> x
+  if (Op->getOpcode() == AArch64ISD::UZP1) {
+    EVT HalfVT = Op.getValueType();
+
+    // Ensure the unzip input is the same size as the unpack output.
+    if (Op->getOperand(0)->getOpcode() != ISD::BITCAST ||
+        Op->getValueType(0) == VT)
+      return SDValue();
+
+    SDValue Bitcast = Op->getOperand(0);
+
+    // Look through bitcasts and unzips.
+    SDValue Input = Bitcast->getOperand(0);
+    while (Input->getOpcode() == ISD::BITCAST ||
+           (Input->getOpcode() == AArch64ISD::UZP1 &&
+            Input->getOperand(0) == Input->getOperand(1)))
+      Input = Input->getOperand(0);
+
+    // Input should come from an extending load.
+    if (!isa<LoadSDNode>(Input) ||
+        cast<LoadSDNode>(Input)->getExtensionType() != ISD::ZEXTLOAD)
+      return SDValue();
+
+    // Ensure that we don't care about the top half of the input.
+    EVT MemVT = cast<LoadSDNode>(Input)->getMemoryVT();
+    if (isPackedVectorType(MemVT, DAG) &&
+        MemVT.getVectorElementType().getScalarSizeInBits() <=
+            HalfVT.getScalarSizeInBits())
+      return Bitcast->getOperand(0);
+  }
+
+  return SDValue();
+}
+
 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
   unsigned Opc = N->getOpcode();
@@ -16905,6 +17000,8 @@
     return performUzpCombine(N, DAG);
   case AArch64ISD::SETCC_MERGE_ZERO:
     return performSetccMergeZeroCombine(N, DAG);
+  case ISD::FP_EXTEND:
+    return performFpExtendCombine(N, DAG, Subtarget);
   case AArch64ISD::GLD1_MERGE_ZERO:
   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
@@ -16923,6 +17020,8 @@
   case AArch64ISD::VASHR:
   case AArch64ISD::VLSHR:
     return performVectorShiftCombine(N, *this, DCI);
+  case AArch64ISD::UUNPKLO:
+    return performUunpkloCombine(N, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return performInsertVectorEltCombine(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -61,31 +61,17 @@
 }
 
 define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
-; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently.
+; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v16f16_v16f32:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_EQ_256-NEXT:    sub x9, sp, #48
-; VBITS_EQ_256-NEXT:    mov x29, sp
-; VBITS_EQ_256-NEXT:    and sp, x9, #0xffffffffffffffe0
-; VBITS_EQ_256-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_EQ_256-NEXT:    .cfi_offset w30, -8
-; VBITS_EQ_256-NEXT:    .cfi_offset w29, -16
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    mov x8, sp
-; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x8]
-; VBITS_EQ_256-NEXT:    ldp q0, q1, [sp]
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT:    mov x8, #8
-; VBITS_EQ_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_EQ_256-NEXT:    fcvt z1.s, p0/m, z1.h
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_EQ_256-NEXT:    mov sp, x29
-; VBITS_EQ_256-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
@@ -184,16 +170,12 @@
 ; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ldr q0, [x0]
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_EQ_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.h
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.d, p0/m, z0.h
+; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.h
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
@@ -288,31 +270,17 @@
 }
 
 define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently.
+; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v8f32_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_EQ_256-NEXT:    sub x9, sp, #48
-; VBITS_EQ_256-NEXT:    mov x29, sp
-; VBITS_EQ_256-NEXT:    and sp, x9, #0xffffffffffffffe0
-; VBITS_EQ_256-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_EQ_256-NEXT:    .cfi_offset w30, -8
-; VBITS_EQ_256-NEXT:    .cfi_offset w29, -16
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    mov x8, sp
-; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_EQ_256-NEXT:    ldp q0, q1, [sp]
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.s
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_EQ_256-NEXT:    mov sp, x29
-; VBITS_EQ_256-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64: