diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15316,6 +15316,40 @@
   return SDValue();
 }
 
+static bool isCheapToExtend(const SDValue &N) {
+  unsigned OC = N->getOpcode();
+  return OC == ISD::LOAD || OC == ISD::MLOAD ||
+         ISD::isConstantSplatVectorAllZeros(N.getNode());
+}
+
+static SDValue
+performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              SelectionDAG &DAG) {
+  // If we have (sext (setcc A B)) and A and B are cheap to extend,
+  // we can move the sext into the arguments and have the same result. For
+  // example, if A and B are both loads, we can make those extending loads and
+  // avoid an extra instruction. This pattern appears often in VLS code
+  // generation where the inputs to the setcc have a different size to the
+  // instruction that wants to use the result of the setcc.
+  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
+         N->getOperand(0)->getOpcode() == ISD::SETCC);
+  const SDValue SetCC = N->getOperand(0);
+
+  if (isCheapToExtend(SetCC.getOperand(0)) &&
+      isCheapToExtend(SetCC.getOperand(1))) {
+    const SDValue Ext1 = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
+                                     N->getValueType(0), SetCC.getOperand(0));
+    const SDValue Ext2 = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
+                                     N->getValueType(0), SetCC.getOperand(1));
+
+    return DAG.getSetCC(
+        SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
+        cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
+  }
+
+  return SDValue();
+}
+
 static SDValue performExtendCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -15334,6 +15368,11 @@
     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
                        NewABD);
   }
+
+  if (N->getOpcode() == ISD::SIGN_EXTEND &&
+      N->getOperand(0)->getOpcode() == ISD::SETCC)
+    return performSignExtendSetCCCombine(N, DCI, DAG);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -265,12 +265,10 @@
 define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -283,12 +281,9 @@
 define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ldr q0, [x1]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    cmeq v0.16b, v0.16b, #0
-; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    cmpne p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
@@ -302,13 +297,9 @@
 define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ldr d0, [x1]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    cmeq v0.8b, v0.8b, #0
-; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
@@ -322,12 +313,10 @@
 define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -340,12 +329,9 @@
 define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ldr q0, [x1]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    cmeq v0.8h, v0.8h, #0
-; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
@@ -359,12 +345,10 @@
 define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
   %b = load <8 x i32>, <8 x i32>* %bp
@@ -377,12 +361,10 @@
 define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_512-NEXT:    ld1b { z0.h }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -395,12 +377,9 @@
 define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ldr q0, [x1]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    cmeq v0.16b, v0.16b, #0
-; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    cmpne p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT:    ld1b { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
@@ -414,13 +393,9 @@
 define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ldr d0, [x1]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    cmeq v0.8b, v0.8b, #0
-; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
@@ -434,12 +409,10 @@
 define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT:    ld1h { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -452,12 +425,9 @@
 define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ldr q0, [x1]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    cmeq v0.8h, v0.8h, #0
-; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
@@ -471,12 +441,10 @@
 define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
   %b = load <8 x i32>, <8 x i32>* %bp
@@ -681,12 +649,10 @@
 define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1sb { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
+; VBITS_GE_2048-NEXT:    ld1sb { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT:    ld1sb { z0.h }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <128 x i8>, <128 x i8>* %bp
@@ -699,13 +665,10 @@
 define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1sb { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT:    ld1sb { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT:    ld1sb { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <64 x i8>, <64 x i8>* %bp
@@ -718,14 +681,10 @@
 define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1sb { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT:    ld1sb { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT:    ld1sb { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -738,12 +697,10 @@
 define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <64 x i16>, <64 x i16>* %bp
@@ -756,13 +713,10 @@
 define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT:    ld1sh { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT:    ld1sh { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <32 x i16>, <32 x i16>* %bp
@@ -775,12 +729,10 @@
 define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <32 x i32>, <32 x i32>* %bp
@@ -793,12 +745,10 @@
 define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl128
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
+; VBITS_GE_2048-NEXT:    ld1sb { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT:    ld1b { z0.h }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <128 x i8>, <128 x i8>* %bp
@@ -811,13 +761,10 @@
 define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1b { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT:    ld1sb { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT:    ld1b { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <64 x i8>, <64 x i8>* %bp
@@ -830,14 +777,10 @@
 define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT:    ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1b { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT:    ld1sb { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT:    ld1b { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -850,12 +793,10 @@
 define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <64 x i16>, <64 x i16>* %bp
@@ -868,13 +809,10 @@
 define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT:    ld1sh { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <32 x i16>, <32 x i16>* %bp
@@ -887,12 +825,10 @@
 define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT:    ret
   %b = load <32 x i32>, <32 x i32>* %bp
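
Note (editorial, not part of the patch): a minimal IR sketch of the shape performSignExtendSetCCCombine targets. Both compare operands are "cheap to extend" (plain loads here), so the sign extension of the compare result can be pushed onto the operands and should select as sign-extending loads feeding a single full-width compare, much like the ld1sb/cmpeq sequences in the updated checks above. The function name is hypothetical, and the example assumes a fixed-length SVE configuration such as the VBITS_GE_512 one used by this test file; it is not taken from the test suite.

; Hypothetical example, mirroring the style of the tests above.
define <16 x i32> @sext_icmp_of_loads(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
  ; Both icmp operands are loads, so isCheapToExtend() accepts them; at the DAG
  ; level the resulting (sext (setcc a, b)) is rewritten to
  ; (setcc (sext a), (sext b)), letting the extends fold into the loads.
  %a = load <16 x i8>, <16 x i8>* %ap
  %b = load <16 x i8>, <16 x i8>* %bp
  %cmp = icmp eq <16 x i8> %a, %b
  %ext = sext <16 x i1> %cmp to <16 x i32>
  ret <16 x i32> %ext
}

; #0 is assumed to carry the SVE target features, as in the existing tests.
attributes #0 = { "target-features"="+sve" }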