diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -15841,6 +15842,23 @@ return SDValue(); } +static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) { + // sunpklo(sext(pred)) -> sext(extract_low_half(pred)) + // This transform works in partnership with performSetCCPunpkCombine to + // remove unnecessary transfer of predicates into standard registers and back. + if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND && + N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() == + MVT::i1) { + SDValue CC = N->getOperand(0)->getOperand(0); + auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext()); + SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC, + DAG.getVectorIdxConstant(0, SDLoc(N))); + return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk); + } + + return SDValue(); +} + /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. static SDValue performPostLD1Combine(SDNode *N, @@ -16518,6 +16536,44 @@ return SDValue(); } +static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { + // setcc_merge_zero pred + // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne + // => extract_subvector (inner setcc_merge_zero) + SDValue Pred = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); + + if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) || + LHS->getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + + SDValue Extract = LHS->getOperand(0); + if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR || + Extract->getValueType(0) != N->getValueType(0) || + Extract->getConstantOperandVal(1) != 0) + return SDValue(); + + SDValue InnerSetCC = Extract->getOperand(0); + if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO) + return SDValue(); + + // By this point we've effectively got + // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive + // lanes are already zero then the trunc(sext()) sequence is redundant and we + // can operate on A directly.
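+  // This is currently limited to both predicates being matching fixed-length
+  // PTRUEs (vl1..vl256): a matching PTRUE guarantees the inner setcc_merge_zero
+  // has already zeroed every lane that is inactive under the outer predicate.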
+ SDValue InnerPred = InnerSetCC.getOperand(0); + if (Pred.getOpcode() == AArch64ISD::PTRUE && + InnerPred.getOpcode() == AArch64ISD::PTRUE && + Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) && + Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 && + Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256) + return Extract; + + return SDValue(); +} + static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && "Unexpected opcode!"); @@ -16536,6 +16592,9 @@ LHS->getOperand(0)->getOperand(0) == Pred) return LHS->getOperand(0); + if (SDValue V = performSetCCPunpkCombine(N, DAG)) + return V; + return SDValue(); } @@ -17479,6 +17538,8 @@ case AArch64ISD::VASHR: case AArch64ISD::VLSHR: return performVectorShiftCombine(N, *this, DCI); + case AArch64ISD::SUNPKLO: + return performSunpkloCombine(N, DAG); case ISD::INSERT_VECTOR_ELT: return performInsertVectorEltCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -159,15 +159,13 @@ ; VBITS_GE_2048-LABEL: masked_gather_v32i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 -; VBITS_GE_2048-NEXT: ptrue p2.d, vl32 +; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1d { z1.d }, p2/z, [x1] +; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: cmpeq p1.b, p0/z, z0.b, #0 -; VBITS_GE_2048-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_2048-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p2/z, z0.d, #0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b ; VBITS_GE_2048-NEXT: ld1b { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h @@ -289,13 +287,11 @@ ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16 ; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_1024-NEXT: cmpeq p2.h, p0/z, z0.h, #0 -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p1/z, [x1] -; VBITS_GE_1024-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z1.d, #0 -; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_1024-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z1.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] @@ -314,13 +310,11 @@ ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: cmpeq p2.h, p0/z, z0.h, #0 -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 -; VBITS_GE_2048-NEXT: ld1h { z0.d }, 
p1/z, [z0.d] +; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] @@ -391,8 +385,9 @@ ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1] ; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 ; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: sunpklo z3.d, z0.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b ; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff ; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 ; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 @@ -411,10 +406,8 @@ ; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p2.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: punpklo p1.h, p1.b ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] @@ -434,10 +427,8 @@ ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_1024-NEXT: cmpeq p2.s, p0/z, z0.s, #0 -; VBITS_GE_1024-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_1024-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b ; VBITS_GE_1024-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] @@ -457,10 +448,8 @@ ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: cmpeq p2.s, p0/z, z0.s, #0 -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] @@ -701,13 +690,11 @@ ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16 ; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_1024-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p1/z, [x1] -; VBITS_GE_1024-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z1.d, #0 -; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z1.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] @@ -726,13 +713,11 
@@ ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 -; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] @@ -798,10 +783,8 @@ ; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_512-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_512-NEXT: punpklo p1.h, p1.b ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] @@ -821,10 +804,8 @@ ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_1024-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b ; VBITS_GE_1024-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] @@ -844,10 +825,8 @@ ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] @@ -988,13 +967,11 @@ ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 -; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d, lsl #1] +; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z1.d, lsl #1] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] @@ -1017,10 +994,8 
@@ ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] @@ -1063,13 +1038,11 @@ ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 -; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d, lsl #1] +; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z1.d, lsl #1] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] @@ -1091,13 +1064,11 @@ ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 -; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d] +; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] @@ -1120,13 +1091,11 @@ ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 -; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d] +; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] @@ -1149,10 +1118,8 @@ ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: 
sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] @@ -1173,10 +1140,8 @@ ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] @@ -1201,11 +1166,10 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z2.d, x2 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] +; VBITS_GE_2048-NEXT: movprfx z0, z1 +; VBITS_GE_2048-NEXT: add z0.d, p1/m, z0.d, z2.d +; VBITS_GE_2048-NEXT: punpklo p1.h, p2.b +; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_2048-NEXT: ret @@ -1229,11 +1193,10 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z2.d, #4 // =0x4 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] +; VBITS_GE_2048-NEXT: movprfx z0, z1 +; VBITS_GE_2048-NEXT: add z0.d, p1/m, z0.d, z2.d +; VBITS_GE_2048-NEXT: punpklo p1.h, p2.b +; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_2048-NEXT: ret @@ -1254,14 +1217,12 @@ ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x2] -; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [z1.d] +; VBITS_GE_2048-NEXT: punpklo p2.h, p1.b +; VBITS_GE_2048-NEXT: ld1w { z1.d }, p2/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_GE_2048-NEXT: mov z0.s, p2/m, z1.s +; VBITS_GE_2048-NEXT: mov z0.s, p1/m, z1.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_2048-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a @@ -1280,10 +1241,8 @@ ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s -; 
VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -268,11 +268,9 @@ ; VBITS_GE_512-NEXT: ptrue p0.b, vl32 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; VBITS_GE_512-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_512-NEXT: cmpne p1.h, p0/z, z0.h, #0 -; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8] ; VBITS_GE_512-NEXT: ret %b = load <32 x i8>, <32 x i8>* %bp @@ -327,11 +325,9 @@ ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0 -; VBITS_GE_512-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] ; VBITS_GE_512-NEXT: ret %b = load <16 x i16>, <16 x i16>* %bp @@ -366,11 +362,9 @@ ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] ; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, <8 x i32>* %bp @@ -386,11 +380,9 @@ ; VBITS_GE_512-NEXT: ptrue p0.b, vl32 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; VBITS_GE_512-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_512-NEXT: cmpne p1.h, p0/z, z0.h, #0 -; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8] ; VBITS_GE_512-NEXT: ret %b = load <32 x i8>, <32 x i8>* %bp @@ -445,11 +437,9 @@ ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0 -; VBITS_GE_512-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] ; VBITS_GE_512-NEXT: ret %b = load <16 x i16>, <16 x i16>* %bp @@ -484,11 +474,9 @@ ; VBITS_GE_512-NEXT: ptrue 
p0.s, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] ; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, <8 x i32>* %bp @@ -690,6 +678,230 @@ ret <8 x i64> %ext } +define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl128 +; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <128 x i8>, <128 x i8>* %bp + %mask = icmp eq <128 x i8> %b, zeroinitializer + %load = call <128 x i8> @llvm.masked.load.v128i8(<128 x i8>* %ap, i32 8, <128 x i1> %mask, <128 x i8> undef) + %ext = sext <128 x i8> %load to <128 x i16> + ret <128 x i16> %ext +} + +define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl64 +; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <64 x i8>, <64 x i8>* %bp + %mask = icmp eq <64 x i8> %b, zeroinitializer + %load = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %ap, i32 8, <64 x i1> %mask, <64 x i8> undef) + %ext = sext <64 x i8> %load to <64 x i32> + ret <64 x i32> %ext +} + +define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 +; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <32 x i8>, <32 x i8>* %bp + %mask = icmp eq <32 x i8> %b, zeroinitializer + %load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) + %ext = sext <32 x i8> %load to <32 x i64> + ret <32 x i64> %ext +} + +define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.h, vl64 +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = 
load <64 x i16>, <64 x i16>* %bp + %mask = icmp eq <64 x i16> %b, zeroinitializer + %load = call <64 x i16> @llvm.masked.load.v64i16(<64 x i16>* %ap, i32 8, <64 x i1> %mask, <64 x i16> undef) + %ext = sext <64 x i16> %load to <64 x i32> + ret <64 x i32> %ext +} + +define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <32 x i16>, <32 x i16>* %bp + %mask = icmp eq <32 x i16> %b, zeroinitializer + %load = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %ap, i32 8, <32 x i1> %mask, <32 x i16> undef) + %ext = sext <32 x i16> %load to <32 x i64> + ret <32 x i64> %ext +} + +define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <32 x i32>, <32 x i32>* %bp + %mask = icmp eq <32 x i32> %b, zeroinitializer + %load = call <32 x i32> @llvm.masked.load.v32i32(<32 x i32>* %ap, i32 8, <32 x i1> %mask, <32 x i32> undef) + %ext = sext <32 x i32> %load to <32 x i64> + ret <32 x i64> %ext +} + +define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl128 +; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1b { z0.h }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <128 x i8>, <128 x i8>* %bp + %mask = icmp eq <128 x i8> %b, zeroinitializer + %load = call <128 x i8> @llvm.masked.load.v128i8(<128 x i8>* %ap, i32 8, <128 x i1> %mask, <128 x i8> undef) + %ext = zext <128 x i8> %load to <128 x i16> + ret <128 x i16> %ext +} + +define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl64 +; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1b { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <64 x i8>, <64 x i8>* %bp + %mask = icmp eq <64 x i8> %b, zeroinitializer + %load = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %ap, i32 8, <64 x i1> %mask, <64 x i8> undef) + %ext = zext <64 x i8> %load to <64 x i32> + ret <64 x i32> %ext +} + +define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; 
VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 +; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <32 x i8>, <32 x i8>* %bp + %mask = icmp eq <32 x i8> %b, zeroinitializer + %load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) + %ext = zext <32 x i8> %load to <32 x i64> + ret <32 x i64> %ext +} + +define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.h, vl64 +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <64 x i16>, <64 x i16>* %bp + %mask = icmp eq <64 x i16> %b, zeroinitializer + %load = call <64 x i16> @llvm.masked.load.v64i16(<64 x i16>* %ap, i32 8, <64 x i1> %mask, <64 x i16> undef) + %ext = zext <64 x i16> %load to <64 x i32> + ret <64 x i32> %ext +} + +define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <32 x i16>, <32 x i16>* %bp + %mask = icmp eq <32 x i16> %b, zeroinitializer + %load = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %ap, i32 8, <32 x i1> %mask, <32 x i16> undef) + %ext = zext <32 x i16> %load to <32 x i64> + ret <32 x i64> %ext +} + +define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 { +; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret + %b = load <32 x i32>, <32 x i32>* %bp + %mask = icmp eq <32 x i32> %b, zeroinitializer + %load = call <32 x i32> @llvm.masked.load.v32i32(<32 x i32>* %ap, i32 8, <32 x i1> %mask, <32 x i32> undef) + %ext = zext <32 x i32> %load to <32 x i64> + ret <32 x i64> %ext +} + declare <2 x half> @llvm.masked.load.v2f16(<2 x half>*, i32, <2 x i1>, <2 x half>) declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) @@ -698,6 +910,7 @@ declare <32 x float> @llvm.masked.load.v32f32(<32 x float>*, i32, <32 x i1>, <32 x float>) declare <64 x float> 
@llvm.masked.load.v64f32(<64 x float>*, i32, <64 x i1>, <64 x float>) +declare <128 x i8> @llvm.masked.load.v128i8(<128 x i8>*, i32, <128 x i1>, <128 x i8>) declare <64 x i8> @llvm.masked.load.v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>) declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>) declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) @@ -705,7 +918,9 @@ declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) +declare <32 x i32> @llvm.masked.load.v32i32(<32 x i32>*, i32, <32 x i1>, <32 x i32>) declare <32 x i16> @llvm.masked.load.v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>) +declare <64 x i16> @llvm.masked.load.v64i16(<64 x i16>*, i32, <64 x i1>, <64 x i16>) declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) declare <8 x i64> @llvm.masked.load.v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>) declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -151,17 +151,15 @@ ; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1d { z2.d }, p1/z, [x1] +; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_2048-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_2048-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z1.d, #0 -; VBITS_GE_2048-NEXT: st1b { z0.d }, p0, [z2.d] +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b +; VBITS_GE_2048-NEXT: st1b { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x i8>, <32 x i8>* %a %ptrs = load <32 x i8*>, <32 x i8*>* %b @@ -272,11 +270,9 @@ ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_1024-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_1024-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b ; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x i16>, <16 x i16>* %a @@ -295,11 +291,9 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, 
[z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x i16>, <32 x i16>* %a @@ -358,20 +352,21 @@ ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 ; VBITS_EQ_256-NEXT: mov x8, #4 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; VBITS_EQ_256-NEXT: punpklo p1.h, p0.b +; VBITS_EQ_256-NEXT: mov z4.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff ; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1] -; VBITS_EQ_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; VBITS_EQ_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: sunpklo z3.d, z1.s -; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_EQ_256-NEXT: cmpne p0.d, p1/z, z3.d, #0 -; VBITS_EQ_256-NEXT: uunpklo z3.d, z0.s -; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ext z4.b, z4.b, z4.b, #16 +; VBITS_EQ_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: uunpklo z1.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s ; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z4.d, #0 ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: st1w { z3.d }, p0, [z4.d] +; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [z3.d] ; VBITS_EQ_256-NEXT: st1w { z0.d }, p1, [z2.d] ; VBITS_EQ_256-NEXT: ret ; @@ -383,9 +378,7 @@ ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_512-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i32>, <8 x i32>* %a @@ -404,9 +397,7 @@ ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_1024-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b ; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x i32>, <16 x i32>* %a @@ -425,9 +416,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x i32>, <32 x i32>* %a @@ -646,11 +635,9 @@ ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_1024-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_1024-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b ; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x half>, <16 x half>* %a @@ -669,11 +656,9 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d 
}, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a @@ -734,9 +719,7 @@ ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_512-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x float>, <8 x float>* %a @@ -755,9 +738,7 @@ ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_1024-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_1024-NEXT: punpklo p0.h, p0.b ; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x float>, <16 x float>* %a @@ -776,9 +757,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a @@ -906,11 +885,9 @@ ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a @@ -932,9 +909,7 @@ ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a @@ -975,11 +950,9 @@ ; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo 
p0.h, p0.b ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a @@ -1001,11 +974,9 @@ ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a @@ -1028,11 +999,9 @@ ; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a @@ -1054,9 +1023,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a @@ -1076,9 +1043,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a @@ -1101,10 +1066,8 @@ ; VBITS_GE_2048-NEXT: mov z2.d, x2 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d -; VBITS_GE_2048-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z2.d, z3.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a @@ -1127,10 +1090,8 @@ ; VBITS_GE_2048-NEXT: mov z2.d, #4 // =0x4 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d -; VBITS_GE_2048-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: sunpklo z2.d, z3.s -; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 +; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-punpklo-combine.ll b/llvm/test/CodeGen/AArch64/sve-punpklo-combine.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/sve-punpklo-combine.ll
@@ -0,0 +1,266 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 8 x i1> @masked_load_sext_i8i16(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 8 x i1> @llvm.experimental.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 8 x i1> %extract to <vscale x 8 x i16>
+  %p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 10)
+  %cmp1 = call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.nxv8i16(<vscale x 8 x i1> %p1, <vscale x 8 x i16> %ext1, <vscale x 8 x i16> zeroinitializer)
+  ret <vscale x 8 x i1> %cmp1
+}
+
+; This negative test ensures the two ptrues have the same vl
+define <vscale x 8 x i1> @masked_load_sext_i8i16_ptrue_vl(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i16_ptrue_vl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 11)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 8 x i1> @llvm.experimental.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 8 x i1> %extract to <vscale x 8 x i16>
+  %p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 10)
+  %cmp1 = call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.nxv8i16(<vscale x 8 x i1> %p1, <vscale x 8 x i16> %ext1, <vscale x 8 x i16> zeroinitializer)
+  ret <vscale x 8 x i1> %cmp1
+}
+
+; This negative test enforces that both predicates are ptrues
+define <vscale x 8 x i1> @masked_load_sext_i8i16_parg(i8* %ap, <vscale x 16 x i8> %b, <vscale x 16 x i1> %p0) #0 {
+; CHECK-LABEL: masked_load_sext_i8i16_parg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    ptrue p1.h, vl32
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    cmpne p0.h, p1/z, z0.h, #0
+; CHECK-NEXT:    ret
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 8 x i1> @llvm.experimental.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 8 x i1> %extract to <vscale x 8 x i16>
+  %p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 10)
+  %cmp1 = call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.nxv8i16(<vscale x 8 x i1> %p1, <vscale x 8 x i16> %ext1, <vscale x 8 x i16> zeroinitializer)
+  ret <vscale x 8 x i1> %cmp1
+}
+
+define <vscale x 4 x i1> @masked_load_sext_i8i32(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 4 x i1> @llvm.experimental.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 4 x i1> %extract to <vscale x 4 x i32>
+  %p1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 10)
+  %cmp1 = call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.nxv4i32(<vscale x 4 x i1> %p1, <vscale x 4 x i32> %ext1, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i1> %cmp1
+}
+
+; This negative test ensures the two ptrues have the same vl
+define <vscale x 4 x i1> @masked_load_sext_i8i32_ptrue_vl(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i32_ptrue_vl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 11)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 4 x i1> @llvm.experimental.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 4 x i1> %extract to <vscale x 4 x i32>
+  %p1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 10)
+  %cmp1 = call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.nxv4i32(<vscale x 4 x i1> %p1, <vscale x 4 x i32> %ext1, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i1> %cmp1
+}
+
+; This negative test enforces that both predicates are ptrues
+define <vscale x 4 x i1> @masked_load_sext_i8i32_parg(i8* %ap, <vscale x 16 x i8> %b, <vscale x 16 x i1> %p0) #0 {
+; CHECK-LABEL: masked_load_sext_i8i32_parg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    ptrue p1.s, vl32
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    cmpne p0.s, p1/z, z0.s, #0
+; CHECK-NEXT:    ret
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 4 x i1> @llvm.experimental.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 4 x i1> %extract to <vscale x 4 x i32>
+  %p1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 10)
+  %cmp1 = call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.nxv4i32(<vscale x 4 x i1> %p1, <vscale x 4 x i32> %ext1, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i1> %cmp1
+}
+
+define <vscale x 2 x i1> @masked_load_sext_i8i64(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 2 x i1> @llvm.experimental.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 2 x i1> %extract to <vscale x 2 x i64>
+  %p1 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 10)
+  %cmp1 = call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %p1, <vscale x 2 x i64> %ext1, <vscale x 2 x i64> zeroinitializer)
+  ret <vscale x 2 x i1> %cmp1
+}
+
+; This negative test ensures the two ptrues have the same vl
+define <vscale x 2 x i1> @masked_load_sext_i8i64_ptrue_vl(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i64_ptrue_vl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 11)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 2 x i1> @llvm.experimental.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 2 x i1> %extract to <vscale x 2 x i64>
+  %p1 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 10)
+  %cmp1 = call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %p1, <vscale x 2 x i64> %ext1, <vscale x 2 x i64> zeroinitializer)
+  ret <vscale x 2 x i1> %cmp1
+}
+
+; This negative test enforces that both predicates are ptrues
+define <vscale x 2 x i1> @masked_load_sext_i8i64_parg(i8* %ap, <vscale x 16 x i8> %b, <vscale x 16 x i1> %p0) #0 {
+; CHECK-LABEL: masked_load_sext_i8i64_parg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    cmpne p0.d, p1/z, z0.d, #0
+; CHECK-NEXT:    ret
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 2 x i1> @llvm.experimental.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 2 x i1> %extract to <vscale x 2 x i64>
+  %p1 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 10)
+  %cmp1 = call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %p1, <vscale x 2 x i64> %ext1, <vscale x 2 x i64> zeroinitializer)
+  ret <vscale x 2 x i1> %cmp1
+}
+
+; This negative test enforces that the ptrues have a specified vl
+define <vscale x 8 x i1> @masked_load_sext_i8i16_ptrue_all(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i16_ptrue_all:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 8 x i1> @llvm.experimental.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 8 x i1> %extract to <vscale x 8 x i16>
+  %p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %cmp1 = call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.nxv8i16(<vscale x 8 x i1> %p1, <vscale x 8 x i16> %ext1, <vscale x 8 x i16> zeroinitializer)
+  ret <vscale x 8 x i1> %cmp1
+}
+
+; This negative test enforces that the ptrues have a specified vl
+define <vscale x 4 x i1> @masked_load_sext_i8i32_ptrue_all(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i32_ptrue_all:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 4 x i1> @llvm.experimental.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 4 x i1> %extract to <vscale x 4 x i32>
+  %p1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %cmp1 = call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.nxv4i32(<vscale x 4 x i1> %p1, <vscale x 4 x i32> %ext1, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i1> %cmp1
+}
+
+; This negative test enforces that the ptrues have a specified vl
+define <vscale x 2 x i1> @masked_load_sext_i8i64_ptrue_all(i8* %ap, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: masked_load_sext_i8i64_ptrue_all:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ret
+  %p0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %cmp = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> %p0, <vscale x 16 x i8> %b, <vscale x 16 x i8> zeroinitializer)
+  %extract = call <vscale x 2 x i1> @llvm.experimental.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1> %cmp, i64 0)
+  %ext1 = sext <vscale x 2 x i1> %extract to <vscale x 2 x i64>
+  %p1 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %cmp1 = call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %p1, <vscale x 2 x i64> %ext1, <vscale x 2 x i64> zeroinitializer)
+  ret <vscale x 2 x i1> %cmp1
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1>, i8*)
+
+declare <vscale x 8 x i1> @llvm.experimental.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1>, i64)
+declare <vscale x 4 x i1> @llvm.experimental.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
+declare <vscale x 2 x i1> @llvm.experimental.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1>, i64)
+
+declare <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+
+attributes #0 = { "target-features"="+sve" }