diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -929,6 +929,8 @@
   setTargetDAGCombine(ISD::VECREDUCE_ADD);
   setTargetDAGCombine(ISD::STEP_VECTOR);
 
+  setTargetDAGCombine(ISD::FP_EXTEND);
+
   setTargetDAGCombine(ISD::GlobalAddress);
 
   // In case of strict alignment, avoid an excessive number of byte wide stores.
@@ -15940,6 +15942,21 @@
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  SDValue Chain = ST->getChain();
+  SDValue Value = ST->getValue();
+  SDValue Ptr = ST->getBasePtr();
+
+  // If this is an FP_ROUND followed by a store, fold this into a truncating
+  // store. We can do this even if this is already a truncstore.
+  // We purposefully don't care about legality of the nodes here as we know
+  // they can be split down into something legal.
+  if (Value.getOpcode() == ISD::FP_ROUND && Value.getNode()->hasOneUse() &&
+      ST->isUnindexed() && Subtarget->useSVEForFixedLengthVectors() &&
+      Value.getValueType().isFixedLengthVector())
+    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+                             ST->getMemoryVT(), ST->getMemOperand());
+
   if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
     return Split;
 
@@ -17282,6 +17299,36 @@
   return DAG.getBitcast(Ty, Trunc);
 }
 
+SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
+                               TargetLowering::DAGCombinerInfo &DCI,
+                               const AArch64Subtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
+  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
+    return SDValue();
+
+  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
+  // We purposefully don't care about legality of the nodes here as we know
+  // they can be split down into something legal.
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      Subtarget->useSVEForFixedLengthVectors() && VT.isFixedLengthVector()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
+                                     LN0->getChain(), LN0->getBasePtr(),
+                                     N0.getValueType(), LN0->getMemOperand());
+    DCI.CombineTo(N, ExtLoad);
+    DCI.CombineTo(N0.getNode(),
+                  DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(),
+                              ExtLoad, DAG.getIntPtrConstant(1, SDLoc(N0))),
+                  ExtLoad.getValue(1));
+    return SDValue(N, 0); // Return N so it doesn't get rechecked!
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -17338,6 +17385,8 @@
     return performSTORECombine(N, DCI, DAG, Subtarget);
   case ISD::VECTOR_SPLICE:
     return performSVESpliceCombine(N, DAG);
+  case ISD::FP_EXTEND:
+    return performFPExtendCombine(N, DAG, DCI, Subtarget);
   case AArch64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
   case AArch64ISD::TBNZ:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -63,17 +63,14 @@
 ; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v16f16_v16f32:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_EQ_256-NEXT:    mov x8, #8
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_EQ_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_EQ_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT:    fcvt z1.s, p0/m, z1.h
+; VBITS_EQ_256-NEXT:    ld1sh { z0.s }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1sh { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.s, p0/m, z0.h
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_EQ_256-NEXT:    fcvt z1.s, p0/m, z1.h
 ; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
@@ -164,18 +161,14 @@
 define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
 ; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ldr q0, [x0]
 ; VBITS_EQ_256-NEXT:    mov x8, #4
 ; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_EQ_256-NEXT:    ld1sh { z0.d }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1sh { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.d, p0/m, z0.h
-; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_EQ_256-NEXT:    uunpklo z1.s, z1.h
-; VBITS_EQ_256-NEXT:    uunpklo z1.d, z1.s
 ; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.h
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
@@ -263,17 +256,14 @@
 ; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v8f32_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    uunpklo z1.d, z0.s
-; VBITS_EQ_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.s
+; VBITS_EQ_256-NEXT:    ld1sw { z0.d }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1sw { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.d, p0/m, z0.s
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
+; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.s
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
@@ -366,15 +356,10 @@
 ; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ptrue p0.s
 ; VBITS_EQ_256-NEXT:    fcvt z0.h, p0/m, z0.s
 ; VBITS_EQ_256-NEXT:    fcvt z1.h, p0/m, z1.s
-; VBITS_EQ_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_EQ_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl8
-; VBITS_EQ_256-NEXT:    splice z1.h, p0, z1.h, z0.h
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_EQ_256-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
@@ -573,15 +558,10 @@
 ; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ptrue p0.d
 ; VBITS_EQ_256-NEXT:    fcvt z0.s, p0/m, z0.d
 ; VBITS_EQ_256-NEXT:    fcvt z1.s, p0/m, z1.d
-; VBITS_EQ_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_EQ_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl4
-; VBITS_EQ_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_EQ_256-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32: