diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -879,6 +879,8 @@
   if (Subtarget->supportsAddressTopByteIgnored())
     setTargetDAGCombine(ISD::LOAD);
 
+  setTargetDAGCombine(ISD::MSTORE);
+
   setTargetDAGCombine(ISD::MUL);
 
   setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
@@ -17144,7 +17146,8 @@
   return SDValue();
 }
 
-static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
+                                    const AArch64Subtarget *Subtarget) {
   assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
           N->getOpcode() == AArch64ISD::UUNPKLO) &&
          "Unexpected Opcode!");
@@ -17153,6 +17156,42 @@
   if (N->getOperand(0).isUndef())
     return DAG.getUNDEF(N->getValueType(0));
 
+  // If this is a masked load followed by an UUNPKLO, fold this into a masked
+  // extending load. We can do this even if this is already a masked
+  // {z,}extload.
+  if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
+      N->getOpcode() == AArch64ISD::UUNPKLO) {
+    MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
+    SDValue Mask = MLD->getMask();
+    SDLoc DL(N);
+
+    if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
+        SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
+        (MLD->getPassThru()->isUndef() ||
+         isZerosVector(MLD->getPassThru().getNode()))) {
+      unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
+      unsigned PgPattern = Mask->getConstantOperandVal(0);
+      EVT VT = N->getValueType(0);
+
+      // Ensure we can double the size of the predicate pattern
+      unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
+      if (NumElts &&
+          NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
+        Mask =
+            getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
+        SDValue PassThru = DAG.getConstant(0, DL, VT);
+        SDValue NewLoad = DAG.getMaskedLoad(
+            VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
+            PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
+            MLD->getAddressingMode(), ISD::ZEXTLOAD);
+
+        DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
+
+        return NewLoad;
+      }
+    }
+  }
+
   return SDValue();
 }
 
@@ -17486,6 +17525,50 @@
   return SDValue();
 }
 
+static SDValue performMSTORECombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG,
+                                    const AArch64Subtarget *Subtarget) {
+  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+  SDValue Value = MST->getValue();
+  SDValue Mask = MST->getMask();
+  SDLoc DL(N);
+
+  // If this is a UZP1 followed by a masked store, fold this into a masked
+  // truncating store. We can do this even if this is already a masked
+  // truncstore.
+  if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
+      MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
+      Value.getValueType().isInteger()) {
+    Value = Value.getOperand(0);
+    if (Value.getOpcode() == ISD::BITCAST) {
+      EVT HalfVT =
+          Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
+      EVT InVT = Value.getOperand(0).getValueType();
+
+      if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
+        unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
+        unsigned PgPattern = Mask->getConstantOperandVal(0);
+
+        // Ensure we can double the size of the predicate pattern
+        unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
+        if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
+                           MinSVESize) {
+          Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
+                          PgPattern);
+          return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
+                                    MST->getBasePtr(), MST->getOffset(), Mask,
+                                    MST->getMemoryVT(), MST->getMemOperand(),
+                                    MST->getAddressingMode(),
+                                    /*IsTruncating=*/true);
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 /// \return true if part of the index was folded into the Base.
 static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
                               SDLoc DL, SelectionDAG &DAG) {
@@ -19402,6 +19485,8 @@
     break;
   case ISD::STORE:
     return performSTORECombine(N, DCI, DAG, Subtarget);
+  case ISD::MSTORE:
+    return performMSTORECombine(N, DCI, DAG, Subtarget);
   case ISD::MGATHER:
   case ISD::MSCATTER:
     return performMaskedGatherScatterCombine(N, DCI, DAG);
@@ -19426,7 +19511,7 @@
     return performSpliceCombine(N, DAG);
   case AArch64ISD::UUNPKLO:
   case AArch64ISD::UUNPKHI:
-    return performUnpackCombine(N, DAG);
+    return performUnpackCombine(N, DAG, Subtarget);
   case AArch64ISD::UZP1:
     return performUzpCombine(N, DAG);
   case AArch64ISD::SETCC_MERGE_ZERO:
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -208,20 +208,19 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]!
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cntd x8 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ptrue p1.s, vl8 -; CHECK-NEXT: subs x8, x8, #8 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: subs x8, x8, #8 +; CHECK-NEXT: ptrue p1.d, vl8 ; CHECK-NEXT: csel x8, xzr, x8, lo ; CHECK-NEXT: mov w9, #8 ; CHECK-NEXT: cmp x8, #8 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [x9, x8, lsl #3] +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1] +; CHECK-NEXT: st1d { z0.d }, p0, [x9, x8, lsl #3] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -386,12 +386,10 @@ ; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ptrue p1.s ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.s -; VBITS_GE_512-NEXT: fcvtzu z0.s, p0/m, z0.s -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: fcvtzu z0.s, p1/m, z0.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a %res = fptoui <16 x float> %op1 to <16 x i16> @@ -403,12 +401,10 @@ ; CHECK-LABEL: fcvtzu_v32f32_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: fcvtzu z0.s, p1/m, z0.s +; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x float>, <32 x float>* %a %res = fptoui <32 x float> %op1 to <32 x i16> @@ -420,12 +416,10 @@ ; CHECK-LABEL: fcvtzu_v64f32_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s -; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: fcvtzu z0.s, p1/m, z0.s +; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <64 x float>, <64 x float>* %a %res = fptoui <64 x float> %op1 to <64 x i16> @@ -714,13 +708,10 @@ ; CHECK-LABEL: fcvtzu_v16f64_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: fcvtzu z0.d, p1/m, z0.d +; CHECK-NEXT: st1h { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a %res = fptoui <16 x double> %op1 to <16 x i16> @@ -732,13 +723,10 @@ ; CHECK-LABEL: fcvtzu_v32f64_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fcvtzu 
z0.d, p0/m, z0.d -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: fcvtzu z0.d, p1/m, z0.d +; CHECK-NEXT: st1h { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, <32 x double>* %a %res = fptoui <32 x double> %op1 to <32 x i16> @@ -809,12 +797,10 @@ ; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ptrue p1.d ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.d -; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.d -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: fcvtzu z0.d, p1/m, z0.d +; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptoui <8 x double> %op1 to <8 x i32> @@ -826,12 +812,10 @@ ; CHECK-LABEL: fcvtzu_v16f64_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: fcvtzu z0.d, p1/m, z0.d +; CHECK-NEXT: st1w { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a %res = fptoui <16 x double> %op1 to <16 x i32> @@ -843,12 +827,10 @@ ; CHECK-LABEL: fcvtzu_v32f64_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: fcvtzu z0.d, p1/m, z0.d +; CHECK-NEXT: st1w { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, <32 x double>* %a %res = fptoui <32 x double> %op1 to <32 x i32> @@ -1330,12 +1312,10 @@ ; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ptrue p1.s ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.s -; VBITS_GE_512-NEXT: fcvtzs z0.s, p0/m, z0.s -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: fcvtzs z0.s, p1/m, z0.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a %res = fptosi <16 x float> %op1 to <16 x i16> @@ -1347,12 +1327,10 @@ ; CHECK-LABEL: fcvtzs_v32f32_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: fcvtzs z0.s, p1/m, z0.s +; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x float>, <32 x float>* %a %res = fptosi <32 x float> %op1 to <32 x i16> @@ -1364,12 +1342,10 @@ ; CHECK-LABEL: fcvtzs_v64f32_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s -; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; 
CHECK-NEXT: fcvtzs z0.s, p1/m, z0.s +; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <64 x float>, <64 x float>* %a %res = fptosi <64 x float> %op1 to <64 x i16> @@ -1658,13 +1634,10 @@ ; CHECK-LABEL: fcvtzs_v16f64_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: fcvtzs z0.d, p1/m, z0.d +; CHECK-NEXT: st1h { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a %res = fptosi <16 x double> %op1 to <16 x i16> @@ -1676,13 +1649,10 @@ ; CHECK-LABEL: fcvtzs_v32f64_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: fcvtzs z0.d, p1/m, z0.d +; CHECK-NEXT: st1h { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, <32 x double>* %a %res = fptosi <32 x double> %op1 to <32 x i16> @@ -1753,12 +1723,10 @@ ; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ptrue p1.d ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.d -; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.d -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: fcvtzs z0.d, p1/m, z0.d +; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptosi <8 x double> %op1 to <8 x i32> @@ -1770,12 +1738,10 @@ ; CHECK-LABEL: fcvtzs_v16f64_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: fcvtzs z0.d, p1/m, z0.d +; CHECK-NEXT: st1w { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a %res = fptosi <16 x double> %op1 to <16 x i32> @@ -1787,12 +1753,10 @@ ; CHECK-LABEL: fcvtzs_v32f64_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: fcvtzs z0.d, p1/m, z0.d +; CHECK-NEXT: st1w { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, <32 x double>* %a %res = fptosi <32 x double> %op1 to <32 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -159,17 +159,15 @@ ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: sunpklo z1.h, z1.b ; 
CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1b { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b @@ -182,17 +180,15 @@ ; CHECK-LABEL: sdiv_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1b { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b @@ -205,20 +201,20 @@ ; CHECK-LABEL: sdiv_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpkhi z2.s, z1.h ; CHECK-NEXT: sunpkhi z3.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, <128 x i8>* %a %op2 = load <128 x i8>, <128 x i8>* %b @@ -394,14 +390,13 @@ ; VBITS_GE_512-LABEL: sdiv_v16i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b @@ -414,14 +409,13 @@ ; CHECK-LABEL: sdiv_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b @@ -434,14 +428,13 @@ ; CHECK-LABEL: sdiv_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, 
[x1] +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, <64 x i16>* %a %op2 = load <64 x i16>, <64 x i16>* %b @@ -871,18 +864,11 @@ define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: udiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0] +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1b { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b @@ -894,18 +880,11 @@ define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: udiv_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ptrue p1.s, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0] +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1b { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b @@ -917,21 +896,18 @@ define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: udiv_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ptrue p1.s, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z2.s, z1.h -; CHECK-NEXT: uunpkhi z3.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] +; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: udivr z0.s, p1/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, <128 x i8>* %a %op2 = load <128 x i8>, <128 x i8>* %b @@ -1106,15 +1082,11 @@ ; ; VBITS_GE_512-LABEL: udiv_v16i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h -; 
VBITS_GE_512-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z1.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b @@ -1126,15 +1098,11 @@ define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: udiv_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b @@ -1146,15 +1114,11 @@ define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: udiv_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ptrue p1.s, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, <64 x i16>* %a %op2 = load <64 x i16>, <64 x i16>* %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -161,10 +161,8 @@ ; ; VBITS_GE_512-LABEL: ucvtf_v16i16_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ucvtf z0.s, p0/m, z0.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret @@ -177,10 +175,8 @@ define void @ucvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: ucvtf_v32i16_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret @@ -193,10 +189,8 @@ define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: ucvtf_v64i16_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: ucvtf z0.s, p0/m, 
z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret @@ -289,11 +283,8 @@ define void @ucvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: ucvtf_v16i16_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -306,11 +297,8 @@ define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: ucvtf_v32i16_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -582,10 +570,8 @@ ; ; VBITS_GE_512-LABEL: ucvtf_v8i32_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ucvtf z0.d, p0/m, z0.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret @@ -598,10 +584,8 @@ define void @ucvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: ucvtf_v16i32_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -614,10 +598,8 @@ define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: ucvtf_v32i32_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll @@ -31,10 +31,7 @@ ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1b { z0.h }, p0, [x0] +; CHECK-NEXT: st1b { z0.d }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <4 x i8*>, <4 x i8*>* %b %vals = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 8, <4 x i1> , <4 x i8> undef) @@ -99,11 +96,7 @@ ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z0.d }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <32 x i8*>, <32 x i8*>* %b %vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x i8*> %ptrs, i32 8, <32 x i1> , <16 x i16*>* %b %vals = call <16 x i16> 
@llvm.masked.gather.v16i16(<16 x i16*> %ptrs, i32 8, <16 x i1> , <32 x i16*>* %b %vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x i16*> %ptrs, i32 8, <32 x i1> , <8 x i32*>* %b %vals = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 8, <8 x i1> , <8 x i32> undef) @@ -292,9 +277,7 @@ ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <16 x i32*>, <16 x i32*>* %b %vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 8, <16 x i1> , <32 x i32*>* %b %vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x i32*> %ptrs, i32 8, <32 x i1> , <4 x i8>* %a %ptrs = load <4 x i8*>, <4 x i8*>* %b @@ -146,15 +143,12 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, #0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1b { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1b { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1b { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <32 x i8>, <32 x i8>* %a %ptrs = load <32 x i8*>, <32 x i8*>* %b @@ -271,13 +265,11 @@ ; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1h { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1h { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <16 x i16>, <16 x i16>* %a %ptrs = load <16 x i16*>, <16 x i16*>* %b @@ -294,13 +286,11 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1h { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1h { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <32 x i16>, <32 x i16>* %a %ptrs = load <32 x i16*>, <32 x i16*>* %b @@ -387,11 +377,10 @@ ; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: punpklo p1.h, p1.b -; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: ld1w { z0.d 
}, p0/z, [z1.d] +; VBITS_GE_512-NEXT: st1w { z0.d }, p1, [x0] ; VBITS_GE_512-NEXT: ret %cval = load <8 x i32>, <8 x i32>* %a %ptrs = load <8 x i32*>, <8 x i32*>* %b @@ -408,11 +397,10 @@ ; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <16 x i32>, <16 x i32>* %a %ptrs = load <16 x i32*>, <16 x i32*>* %b @@ -429,11 +417,10 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <32 x i32>, <32 x i32>* %a %ptrs = load <32 x i32*>, <32 x i32*>* %b diff --git a/llvm/test/CodeGen/AArch64/sve-uunpklo-load-uzp1-store-combine.ll b/llvm/test/CodeGen/AArch64/sve-uunpklo-load-uzp1-store-combine.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-uunpklo-load-uzp1-store-combine.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; Check that we don't try and merge uunpklo/uzp1 with a load or store if we +; would end up creating a predicate that would be too large for the max VL. 
+
+; UUNPKLO + Load
+
+define <vscale x 8 x i16> @uunpklo_i8_valid(ptr %b) #0 {
+; CHECK-LABEL: uunpklo_i8_valid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %mask = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 11)
+  %load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %b, i32 2, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
+  %uzp = call <vscale x 8 x i16> @llvm.aarch64.sve.uunpklo.nxv8i16(<vscale x 16 x i8> %load)
+  ret <vscale x 8 x i16> %uzp
+}
+
+define <vscale x 8 x i16> @uunpklo_i8_invalid(ptr %b) #0 {
+; CHECK-LABEL: uunpklo_i8_invalid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    ret
+  %mask = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 12)
+  %load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %b, i32 2, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
+  %uzp = call <vscale x 8 x i16> @llvm.aarch64.sve.uunpklo.nxv8i16(<vscale x 16 x i8> %load)
+  ret <vscale x 8 x i16> %uzp
+}
+
+define <vscale x 4 x i32> @uunpklo_i16_valid(ptr %b) #0 {
+; CHECK-LABEL: uunpklo_i16_valid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %mask = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 10)
+  %load = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr %b, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
+  %uzp = call <vscale x 4 x i32> @llvm.aarch64.sve.uunpklo.nxv4i32(<vscale x 8 x i16> %load)
+  ret <vscale x 4 x i32> %uzp
+}
+
+define <vscale x 4 x i32> @uunpklo_i16_invalid(ptr %b) #0 {
+; CHECK-LABEL: uunpklo_i16_invalid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ret
+  %mask = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 11)
+  %load = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr %b, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
+  %uzp = call <vscale x 4 x i32> @llvm.aarch64.sve.uunpklo.nxv4i32(<vscale x 8 x i16> %load)
+  ret <vscale x 4 x i32> %uzp
+}
+
+define <vscale x 2 x i64> @uunpklo_i32_valid(ptr %b) #0 {
+; CHECK-LABEL: uunpklo_i32_valid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 9)
+  %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %b, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  %uzp = call <vscale x 2 x i64> @llvm.aarch64.sve.uunpklo.nxv2i64(<vscale x 4 x i32> %load)
+  ret <vscale x 2 x i64> %uzp
+}
+
+define <vscale x 2 x i64> @uunpklo_i32_invalid(ptr %b) #0 {
+; CHECK-LABEL: uunpklo_i32_invalid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ret
+  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 10)
+  %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %b, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  %uzp = call <vscale x 2 x i64> @llvm.aarch64.sve.uunpklo.nxv2i64(<vscale x 4 x i32> %load)
+  ret <vscale x 2 x i64> %uzp
+}
+
+define <vscale x 2 x i64> @uunpklo_invalid_all(ptr %b) #0 {
+; CHECK-LABEL: uunpklo_invalid_all:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ret
+  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %b, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  %uzp = call <vscale x 2 x i64> @llvm.aarch64.sve.uunpklo.nxv2i64(<vscale x 4 x i32> %load)
+  ret <vscale x 2 x i64> %uzp
+}
+
+; UZP1 + Store
+
+define void @uzp1_i8_valid(<vscale x 8 x i16> %a, ptr %b) #0 {
+; CHECK-LABEL: uzp1_i8_valid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %a.bc = bitcast <vscale x 8 x i16> %a to <vscale x 16 x i8>
+  %uzp = call <vscale x 16 x i8> @llvm.aarch64.sve.uzp1.nxv16i8(<vscale x 16 x i8> %a.bc, <vscale x 16 x i8> %a.bc)
+  %mask = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 11)
+  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %uzp, ptr %b, i32 2, <vscale x 16 x i1> %mask)
+  ret void
+}
+
+define void @uzp1_i8_invalid(<vscale x 8 x i16> %a, ptr %b) #0 {
+; CHECK-LABEL: uzp1_i8_invalid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  %a.bc = bitcast <vscale x 8 x i16> %a to <vscale x 16 x i8>
+  %uzp = call <vscale x 16 x i8> @llvm.aarch64.sve.uzp1.nxv16i8(<vscale x 16 x i8> %a.bc, <vscale x 16 x i8> %a.bc)
+  %mask = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 12)
+  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %uzp, ptr %b, i32 2, <vscale x 16 x i1> %mask)
+  ret void
+}
+
+define void @uzp1_i16_valid(<vscale x 4 x i32> %a, ptr %b) #0 {
+; CHECK-LABEL: uzp1_i16_valid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %a.bc = bitcast <vscale x 4 x i32> %a to <vscale x 8 x i16>
+  %uzp = call <vscale x 8 x i16> @llvm.aarch64.sve.uzp1.nxv8i16(<vscale x 8 x i16> %a.bc, <vscale x 8 x i16> %a.bc)
+  %mask = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 10)
+  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %uzp, ptr %b, i32 2, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+define void @uzp1_i16_invalid(<vscale x 4 x i32> %a, ptr %b) #0 {
+; CHECK-LABEL: uzp1_i16_invalid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %a.bc = bitcast <vscale x 4 x i32> %a to <vscale x 8 x i16>
+  %uzp = call <vscale x 8 x i16> @llvm.aarch64.sve.uzp1.nxv8i16(<vscale x 8 x i16> %a.bc, <vscale x 8 x i16> %a.bc)
+  %mask = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 11)
+  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %uzp, ptr %b, i32 2, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+define void @uzp1_i32_valid(<vscale x 2 x i64> %a, ptr %b) #0 {
+; CHECK-LABEL: uzp1_i32_valid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %a.bc = bitcast <vscale x 2 x i64> %a to <vscale x 4 x i32>
+  %uzp = call <vscale x 4 x i32> @llvm.aarch64.sve.uzp1.nxv4i32(<vscale x 4 x i32> %a.bc, <vscale x 4 x i32> %a.bc)
+  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 9)
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %uzp, ptr %b, i32 2, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @uzp1_i32_invalid(<vscale x 2 x i64> %a, ptr %b) #0 {
+; CHECK-LABEL: uzp1_i32_invalid:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %a.bc = bitcast <vscale x 2 x i64> %a to <vscale x 4 x i32>
+  %uzp = call <vscale x 4 x i32> @llvm.aarch64.sve.uzp1.nxv4i32(<vscale x 4 x i32> %a.bc, <vscale x 4 x i32> %a.bc)
+  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 10)
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %uzp, ptr %b, i32 2, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @uzp1_invalid_all(<vscale x 2 x i64> %a, ptr %b) #0 {
+; CHECK-LABEL: uzp1_invalid_all:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %a.bc = bitcast <vscale x 2 x i64> %a to <vscale x 4 x i32>
+  %uzp = call <vscale x 4 x i32> @llvm.aarch64.sve.uzp1.nxv4i32(<vscale x 4 x i32> %a.bc, <vscale x 4 x i32> %a.bc)
+  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %uzp, ptr %b, i32 2, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 %pattern)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 %pattern)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uunpklo.nxv8i16(<vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uunpklo.nxv4i32(<vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uunpklo.nxv2i64(<vscale x 4 x i32>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.uzp1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uzp1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uzp1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+
+declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)
+declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, ptr, i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
+
+attributes #0 = { "target-features"="+sve" vscale_range(8,0) }