Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -861,6 +861,7 @@
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
                               unsigned NewOp) const;
+  SDValue LowerToPredicatedSelect(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1084,6 +1084,7 @@
   setOperationAction(ISD::LOAD, VT, Custom);
   setOperationAction(ISD::STORE, VT, Custom);
   setOperationAction(ISD::TRUNCATE, VT, Custom);
+  setOperationAction(ISD::VSELECT, VT, Custom);
 }
 
 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
@@ -3460,6 +3461,8 @@
     return LowerSETCC(Op, DAG);
   case ISD::BR_CC:
     return LowerBR_CC(Op, DAG);
+  case ISD::VSELECT:
+    return LowerToPredicatedSelect(Op, DAG);
   case ISD::SELECT:
     return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC:
@@ -15020,6 +15023,31 @@
   }
 }
 
+static MVT getContainerForFixedLengthMask(EVT VT) {
+  MVT MaskVT;
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for SVE predicate");
+  case MVT::i8:
+    MaskVT = MVT::nxv16i1;
+    break;
+  case MVT::i16:
+  case MVT::f16:
+    MaskVT = MVT::nxv8i1;
+    break;
+  case MVT::i32:
+  case MVT::f32:
+    MaskVT = MVT::nxv4i1;
+    break;
+  case MVT::i64:
+  case MVT::f64:
+    MaskVT = MVT::nxv2i1;
+    break;
+  }
+
+  return MaskVT;
+}
+
 // Return a PTRUE with active lanes corresponding to the extent of VT.
 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
                                                 EVT VT) {
@@ -15064,27 +15092,7 @@
   // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
   // variants of instructions when available.
 
-  MVT MaskVT;
-  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
-  default:
-    llvm_unreachable("unexpected element type for SVE predicate");
-  case MVT::i8:
-    MaskVT = MVT::nxv16i1;
-    break;
-  case MVT::i16:
-  case MVT::f16:
-    MaskVT = MVT::nxv8i1;
-    break;
-  case MVT::i32:
-  case MVT::f32:
-    MaskVT = MVT::nxv4i1;
-    break;
-  case MVT::i64:
-  case MVT::f64:
-    MaskVT = MVT::nxv2i1;
-    break;
-  }
-
+  MVT MaskVT = getContainerForFixedLengthMask(VT);
   return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
                      DAG.getTargetConstant(PgPattern, DL, MVT::i64));
 }
@@ -15104,6 +15112,25 @@
   return getPredicateForScalableVector(DAG, DL, VT);
 }
 
+static SDValue convertToScalableMask(SelectionDAG &DAG, EVT VT, SDValue V) {
+  assert(VT.isScalableVector() &&
+         "Expected to convert into a scalable vector!");
+  assert(V.getValueType().isFixedLengthVector() &&
+         "Expected a fixed length vector operand!");
+  SDLoc DL(V);
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, V.getValueType());
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+  // FIXME: Fixed width vXi1 are not legal types. Insert the zero extended
+  // mask into a scalable vector, and then truncate to nxvXi1. Is there
+  // a better sequence to do this??
+  SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
+                            DAG.getUNDEF(ContainerVT), V, Zero);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Ins);
+
+  return Trunc;
+}
+
 // Grow V to consume an entire SVE register.
 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
   assert(VT.isScalableVector() &&
@@ -15237,3 +15264,31 @@
 
   return DAG.getNode(NewOp, DL, VT, Operands);
 }
+
+SDValue
+AArch64TargetLowering::LowerToPredicatedSelect(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  if (useSVEForFixedLengthVectorVT(VT)) {
+    auto Pg = getPredicateForVector(DAG, DL, VT);
+    auto Mask = Op->getOperand(0);
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+    MVT MaskTy = getContainerForFixedLengthMask(ContainerVT);
+
+    // Combine the select condition with the fixed VL mask.
+    SDValue CombinedMask = DAG.getNode(ISD::AND, DL, MaskTy, Pg,
+                                       convertToScalableMask(DAG, MaskTy, Mask));
+    SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
+    SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
+
+    auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
+                                   CombinedMask, Op1, Op2);
+
+    return convertFromScalableVector(DAG, VT, ScalableRes);
+  }
+
+  assert(VT.isScalableVector() && "Only expect to lower scalable vector select!");
+  return SDValue();
+}
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
@@ -0,0 +1,176 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define void @select_v4f16(<4 x half>* %a, <4 x half>* %b, <4 x i1>* %c) #0 {
+  %mask = load <4 x i1>, <4 x i1>* %c
+  %op1 = load <4 x half>, <4 x half>* %a
+  %op2 = load <4 x half>, <4 x half>* %b
+  %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
+  store <4 x half> %sel, <4 x half>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @select_v8f16(<8 x half>* %a, <8 x half>* %b, <8 x i1>* %c) #0 {
+  %mask = load <8 x i1>, <8 x i1>* %c
+  %op1 = load <8 x half>, <8 x half>* %a
+  %op2 = load <8 x half>, <8 x half>* %b
+  %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
+  store <8 x half> %sel, <8 x half>* %a
+  ret void
+}
+
+define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i1>* %c) #0 {
+  %mask = load <16 x i1>, <16 x i1>* %c
+  %op1 = load <16 x half>, <16 x half>* %a
+  %op2 = load <16 x half>, <16 x half>* %b
+  %sel = select <16 x i1> %mask, <16 x half> %op1, <16 x half> %op2
+  store <16 x half> %sel, <16 x half>* %a
+  ret void
+}
+
+define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i1>* %c) #0 {
+  %mask = load <32 x i1>, <32 x i1>* %c
+  %op1 = load <32 x half>, <32 x half>* %a
+  %op2 = load <32 x half>, <32 x half>* %b
+  %sel = select <32 x i1> %mask, <32 x half> %op1, <32 x half> %op2
+  store <32 x half> %sel, <32 x half>* %a
+  ret void
+}
+
+define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i1>* %c) #0 {
+  %mask = load <64 x i1>, <64 x i1>* %c
+  %op1 = load <64 x half>, <64 x half>* %a
+  %op2 = load <64 x half>, <64 x half>* %b
+  %sel = select <64 x i1> %mask, <64 x half> %op1, <64 x half> %op2
+  store <64 x half> %sel, <64 x half>* %a
+  ret void
+}
+
+define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i1>* %c) #0 {
+  %mask = load <128 x i1>, <128 x i1>* %c
+  %op1 = load <128 x half>, <128 x half>* %a
+  %op2 = load <128 x half>, <128 x half>* %b
+  %sel = select <128 x i1> %mask, <128 x half> %op1, <128 x half> %op2
+  store <128 x half> %sel, <128 x half>* %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define void @select_v2f32(<2 x float>* %a, <2 x float>* %b, <2 x i1>* %c) #0 {
+  %mask = load <2 x i1>, <2 x i1>* %c
+  %op1 = load <2 x float>, <2 x float>* %a
+  %op2 = load <2 x float>, <2 x float>* %b
+  %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
+  store <2 x float> %sel, <2 x float>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @select_v4f32(<4 x float>* %a, <4 x float>* %b, <4 x i1>* %c) #0 {
+  %mask = load <4 x i1>, <4 x i1>* %c
+  %op1 = load <4 x float>, <4 x float>* %a
+  %op2 = load <4 x float>, <4 x float>* %b
+  %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
+  store <4 x float> %sel, <4 x float>* %a
+  ret void
+}
+
+define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i1>* %c) #0 {
+  %mask = load <8 x i1>, <8 x i1>* %c
+  %op1 = load <8 x float>, <8 x float>* %a
+  %op2 = load <8 x float>, <8 x float>* %b
+  %sel = select <8 x i1> %mask, <8 x float> %op1, <8 x float> %op2
+  store <8 x float> %sel, <8 x float>* %a
+  ret void
+}
+
+define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i1>* %c) #0 {
+  %mask = load <16 x i1>, <16 x i1>* %c
+  %op1 = load <16 x float>, <16 x float>* %a
+  %op2 = load <16 x float>, <16 x float>* %b
+  %sel = select <16 x i1> %mask, <16 x float> %op1, <16 x float> %op2
+  store <16 x float> %sel, <16 x float>* %a
+  ret void
+}
+
+define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i1>* %c) #0 {
+  %mask = load <32 x i1>, <32 x i1>* %c
+  %op1 = load <32 x float>, <32 x float>* %a
+  %op2 = load <32 x float>, <32 x float>* %b
+  %sel = select <32 x i1> %mask, <32 x float> %op1, <32 x float> %op2
+  store <32 x float> %sel, <32 x float>* %a
+  ret void
+}
+
+define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i1>* %c) #0 {
+  %mask = load <64 x i1>, <64 x i1>* %c
+  %op1 = load <64 x float>, <64 x float>* %a
+  %op2 = load <64 x float>, <64 x float>* %b
+  %sel = select <64 x i1> %mask, <64 x float> %op1, <64 x float> %op2
+  store <64 x float> %sel, <64 x float>* %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define void @select_v1f64(<1 x double>* %a, <1 x double>* %b, <1 x i1>* %c) #0 {
+  %mask = load <1 x i1>, <1 x i1>* %c
+  %op1 = load <1 x double>, <1 x double>* %a
+  %op2 = load <1 x double>, <1 x double>* %b
+  %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
+  store <1 x double> %sel, <1 x double>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @select_v2f64(<2 x double>* %a, <2 x double>* %b, <2 x i1>* %c) #0 {
+  %mask = load <2 x i1>, <2 x i1>* %c
+  %op1 = load <2 x double>, <2 x double>* %a
+  %op2 = load <2 x double>, <2 x double>* %b
+  %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
+  store <2 x double> %sel, <2 x double>* %a
+  ret void
+}
+
+define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i1>* %c) #0 {
+  %mask = load <4 x i1>, <4 x i1>* %c
+  %op1 = load <4 x double>, <4 x double>* %a
+  %op2 = load <4 x double>, <4 x double>* %b
+  %sel = select <4 x i1> %mask, <4 x double> %op1, <4 x double> %op2
+  store <4 x double> %sel, <4 x double>* %a
+  ret void
+}
+
+define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i1>* %c) #0 {
+  %mask = load <8 x i1>, <8 x i1>* %c
+  %op1 = load <8 x double>, <8 x double>* %a
+  %op2 = load <8 x double>, <8 x double>* %b
+  %sel = select <8 x i1> %mask, <8 x double> %op1, <8 x double> %op2
+  store <8 x double> %sel, <8 x double>* %a
+  ret void
+}
+
+define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i1>* %c) #0 {
+  %mask = load <16 x i1>, <16 x i1>* %c
+  %op1 = load <16 x double>, <16 x double>* %a
+  %op2 = load <16 x double>, <16 x double>* %b
+  %sel = select <16 x i1> %mask, <16 x double> %op1, <16 x double> %op2
+  store <16 x double> %sel, <16 x double>* %a
+  ret void
+}
+
+define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i1>* %c) #0 {
+  %mask = load <32 x i1>, <32 x i1>* %c
+  %op1 = load <32 x double>, <32 x double>* %a
+  %op2 = load <32 x double>, <32 x double>* %b
+  %sel = select <32 x i1> %mask, <32 x double> %op1, <32 x double> %op2
+  store <32 x double> %sel, <32 x double>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
@@ -0,0 +1,255 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define void @select_v8i8(<8 x i8>* %a, <8 x i8>* %b, <8 x i1>* %c) #0 {
+;
+  %mask = load <8 x i1>, <8 x i1>* %c
+  %op1 = load <8 x i8>, <8 x i8>* %a
+  %op2 = load <8 x i8>, <8 x i8>* %b
+  %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2
+  store <8 x i8> %sel, <8 x i8>* %a
+  ret void
+}
+
+define void @select_v16i8(<16 x i8>* %a, <16 x i8>* %b, <16 x i1>* %c) #0 {
+;
+  %mask = load <16 x i1>, <16 x i1>* %c
+  %op1 = load <16 x i8>, <16 x i8>* %a
+  %op2 = load <16 x i8>, <16 x i8>* %b
+  %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2
+  store <16 x i8> %sel, <16 x i8>* %a
+  ret void
+}
+
+define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, <32 x i1>* %c) #0 {
+;
+  %mask = load <32 x i1>, <32 x i1>* %c
+  %op1 = load <32 x i8>, <32 x i8>* %a
+  %op2 = load <32 x i8>, <32 x i8>* %b
+  %sel = select <32 x i1> %mask, <32 x i8> %op1, <32 x i8> %op2
+  store <32 x i8> %sel, <32 x i8>* %a
+  ret void
+}
+
+define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, <64 x i1>* %c) #0 {
+;
+  %mask = load <64 x i1>, <64 x i1>* %c
+  %op1 = load <64 x i8>, <64 x i8>* %a
+  %op2 = load <64 x i8>, <64 x i8>* %b
+  %sel = select <64 x i1> %mask, <64 x i8> %op1, <64 x i8> %op2
+  store <64 x i8> %sel, <64 x i8>* %a
+  ret void
+}
+
+define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, <128 x i1>* %c) #0 {
+;
+  %mask = load <128 x i1>, <128 x i1>* %c
+  %op1 = load <128 x i8>, <128 x i8>* %a
+  %op2 = load <128 x i8>, <128 x i8>* %b
+  %sel = select <128 x i1> %mask, <128 x i8> %op1, <128 x i8> %op2
+  store <128 x i8> %sel, <128 x i8>* %a
+  ret void
+}
+
+define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, <256 x i1>* %c) #0 {
+;
+  %mask = load <256 x i1>, <256 x i1>* %c
+  %op1 = load <256 x i8>, <256 x i8>* %a
+  %op2 = load <256 x i8>, <256 x i8>* %b
+  %sel = select <256 x i1> %mask, <256 x i8> %op1, <256 x i8> %op2
+  store <256 x i8> %sel, <256 x i8>* %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define void @select_v4i16(<4 x i16>* %a, <4 x i16>* %b, <4 x i1>* %c) #0 {
+;
+  %mask = load <4 x i1>, <4 x i1>* %c
+  %op1 = load <4 x i16>, <4 x i16>* %a
+  %op2 = load <4 x i16>, <4 x i16>* %b
+  %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2
+  store <4 x i16> %sel, <4 x i16>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @select_v8i16(<8 x i16>* %a, <8 x i16>* %b, <8 x i1>* %c) #0 {
+;
+  %mask = load <8 x i1>, <8 x i1>* %c
+  %op1 = load <8 x i16>, <8 x i16>* %a
+  %op2 = load <8 x i16>, <8 x i16>* %b
+  %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2
+  store <8 x i16> %sel, <8 x i16>* %a
+  ret void
+}
+
+define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, <16 x i1>* %c) #0 {
+;
+  %mask = load <16 x i1>, <16 x i1>* %c
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %op2 = load <16 x i16>, <16 x i16>* %b
+  %sel = select <16 x i1> %mask, <16 x i16> %op1, <16 x i16> %op2
+  store <16 x i16> %sel, <16 x i16>* %a
+  ret void
+}
+
+define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, <32 x i1>* %c) #0 {
+;
+  %mask = load <32 x i1>, <32 x i1>* %c
+  %op1 = load <32 x i16>, <32 x i16>* %a
+  %op2 = load <32 x i16>, <32 x i16>* %b
+  %sel = select <32 x i1> %mask, <32 x i16> %op1, <32 x i16> %op2
+  store <32 x i16> %sel, <32 x i16>* %a
+  ret void
+}
+
+define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, <64 x i1>* %c) #0 {
+;
+  %mask = load <64 x i1>, <64 x i1>* %c
+  %op1 = load <64 x i16>, <64 x i16>* %a
+  %op2 = load <64 x i16>, <64 x i16>* %b
+  %sel = select <64 x i1> %mask, <64 x i16> %op1, <64 x i16> %op2
+  store <64 x i16> %sel, <64 x i16>* %a
+  ret void
+}
+
+define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, <128 x i1>* %c) #0 {
+;
+  %mask = load <128 x i1>, <128 x i1>* %c
+  %op1 = load <128 x i16>, <128 x i16>* %a
+  %op2 = load <128 x i16>, <128 x i16>* %b
+  %sel = select <128 x i1> %mask, <128 x i16> %op1, <128 x i16> %op2
+  store <128 x i16> %sel, <128 x i16>* %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define void @select_v2i32(<2 x i32>* %a, <2 x i32>* %b, <2 x i1>* %c) #0 {
+;
+  %mask = load <2 x i1>, <2 x i1>* %c
+  %op1 = load <2 x i32>, <2 x i32>* %a
+  %op2 = load <2 x i32>, <2 x i32>* %b
+  %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2
+  store <2 x i32> %sel, <2 x i32>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @select_v4i32(<4 x i32>* %a, <4 x i32>* %b, <4 x i1>* %c) #0 {
+;
+  %mask = load <4 x i1>, <4 x i1>* %c
+  %op1 = load <4 x i32>, <4 x i32>* %a
+  %op2 = load <4 x i32>, <4 x i32>* %b
+  %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2
+  store <4 x i32> %sel, <4 x i32>* %a
+  ret void
+}
+
+define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, <8 x i1>* %c) #0 {
+;
+  %mask = load <8 x i1>, <8 x i1>* %c
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %op2 = load <8 x i32>, <8 x i32>* %b
+  %sel = select <8 x i1> %mask, <8 x i32> %op1, <8 x i32> %op2
+  store <8 x i32> %sel, <8 x i32>* %a
+  ret void
+}
+
+define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, <16 x i1>* %c) #0 {
+;
+  %mask = load <16 x i1>, <16 x i1>* %c
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %sel = select <16 x i1> %mask, <16 x i32> %op1, <16 x i32> %op2
+  store <16 x i32> %sel, <16 x i32>* %a
+  ret void
+}
+
+define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, <32 x i1>* %c) #0 {
+;
+  %mask = load <32 x i1>, <32 x i1>* %c
+  %op1 = load <32 x i32>, <32 x i32>* %a
+  %op2 = load <32 x i32>, <32 x i32>* %b
+  %sel = select <32 x i1> %mask, <32 x i32> %op1, <32 x i32> %op2
+  store <32 x i32> %sel, <32 x i32>* %a
+  ret void
+}
+
+define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, <64 x i1>* %c) #0 {
+;
+  %mask = load <64 x i1>, <64 x i1>* %c
+  %op1 = load <64 x i32>, <64 x i32>* %a
+  %op2 = load <64 x i32>, <64 x i32>* %b
+  %sel = select <64 x i1> %mask, <64 x i32> %op1, <64 x i32> %op2
+  store <64 x i32> %sel, <64 x i32>* %a
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define void @select_v1i64(<1 x i64>* %a, <1 x i64>* %b, <1 x i1>* %c) #0 {
+;
+  %mask = load <1 x i1>, <1 x i1>* %c
+  %op1 = load <1 x i64>, <1 x i64>* %a
+  %op2 = load <1 x i64>, <1 x i64>* %b
+  %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2
+  store <1 x i64> %sel, <1 x i64>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @select_v2i64(<2 x i64>* %a, <2 x i64>* %b, <2 x i1>* %c) #0 {
+;
+  %mask = load <2 x i1>, <2 x i1>* %c
+  %op1 = load <2 x i64>, <2 x i64>* %a
+  %op2 = load <2 x i64>, <2 x i64>* %b
+  %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2
+  store <2 x i64> %sel, <2 x i64>* %a
+  ret void
+}
+
+define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, <4 x i1>* %c) #0 {
+;
+  %mask = load <4 x i1>, <4 x i1>* %c
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %op2 = load <4 x i64>, <4 x i64>* %b
+  %sel = select <4 x i1> %mask, <4 x i64> %op1, <4 x i64> %op2
+  store <4 x i64> %sel, <4 x i64>* %a
+  ret void
+}
+
+define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, <8 x i1>* %c) #0 {
+;
+  %mask = load <8 x i1>, <8 x i1>* %c
+  %op1 = load <8 x i64>, <8 x i64>* %a
+  %op2 = load <8 x i64>, <8 x i64>* %b
+  %sel = select <8 x i1> %mask, <8 x i64> %op1, <8 x i64> %op2
+  store <8 x i64> %sel, <8 x i64>* %a
+  ret void
+}
+
+define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, <16 x i1>* %c) #0 {
+;
+  %mask = load <16 x i1>, <16 x i1>* %c
+  %op1 = load <16 x i64>, <16 x i64>* %a
+  %op2 = load <16 x i64>, <16 x i64>* %b
+  %sel = select <16 x i1> %mask, <16 x i64> %op1, <16 x i64> %op2
+  store <16 x i64> %sel, <16 x i64>* %a
+  ret void
+}
+
+define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, <32 x i1>* %c) #0 {
+;
+  %mask = load <32 x i1>, <32 x i1>* %c
+  %op1 = load <32 x i64>, <32 x i64>* %a
+  %op2 = load <32 x i64>, <32 x i64>* %b
+  %sel = select <32 x i1> %mask, <32 x i64> %op1, <32 x i64> %op2
+  store <32 x i64> %sel, <32 x i64>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }