Index: ../include/llvm/IR/Intrinsics.h
===================================================================
--- ../include/llvm/IR/Intrinsics.h
+++ ../include/llvm/IR/Intrinsics.h
@@ -100,7 +100,7 @@
       Void, VarArg, MMX, Token, Metadata, Half, Float, Double,
       Integer, Vector, Pointer, Struct,
       Argument, ExtendArgument, TruncArgument, HalfVecArgument,
-      SameVecWidthArgument, PtrToArgument, VecOfPtrsToElt
+      SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfPtrsToElt
     } Kind;
 
     union {
@@ -123,7 +123,7 @@
       assert(Kind == Argument || Kind == ExtendArgument ||
              Kind == TruncArgument || Kind == HalfVecArgument ||
              Kind == SameVecWidthArgument || Kind == PtrToArgument ||
-             Kind == VecOfPtrsToElt);
+             Kind == PtrToElt || Kind == VecOfPtrsToElt);
       return Argument_Info >> 3;
     }
     ArgKind getArgumentKind() const {
Index: ../include/llvm/IR/Intrinsics.td
===================================================================
--- ../include/llvm/IR/Intrinsics.td
+++ ../include/llvm/IR/Intrinsics.td
@@ -133,6 +133,7 @@
   ValueType ElTy = elty.VT;
 }
 class LLVMPointerTo<int num> : LLVMMatchType<num>;
+class LLVMPointerToElt<int num> : LLVMMatchType<num>;
 class LLVMVectorOfPointersToElt<int num> : LLVMMatchType<num>;
 
 // Match the type of another intrinsic parameter that is expected to be a
@@ -718,13 +719,25 @@
                        [LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
                         LLVMVectorSameWidth<0, llvm_i1_ty>,
                         LLVMMatchType<0>],
-                       [IntrReadMem]>;
+                        [IntrReadMem]>;
 
 def int_masked_scatter: Intrinsic<[],
                         [llvm_anyvector_ty,
                          LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
                          LLVMVectorSameWidth<0, llvm_i1_ty>]>;
 
+def int_masked_expandload: Intrinsic<[llvm_anyvector_ty],
+                                     [LLVMPointerToElt<0>,
+                                      LLVMVectorSameWidth<0, llvm_i1_ty>,
+                                      LLVMMatchType<0>],
+                                     [IntrReadMem]>;
+
+def int_masked_compressstore: Intrinsic<[],
+                                        [llvm_anyvector_ty,
+                                         LLVMPointerToElt<0>,
+                                         LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                        [IntrArgMemOnly]>;
+
 // Test whether a pointer is associated with a type metadata identifier.
 def int_type_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
                               [IntrNoMem]>;
Index: ../lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- ../lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ ../lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5575,7 +5575,7 @@
                             Alignment, MST->getAAInfo(), MST->getRanges());
 
     Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
-                            MST->isTruncatingStore());
+                            MST->isTruncatingStore(), MST->isCompressingStore());
 
     unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
     Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -5588,7 +5588,7 @@
                          MST->getRanges());
 
     Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
-                            MST->isTruncatingStore());
+                            MST->isTruncatingStore(), MST->isCompressingStore());
 
     AddToWorklist(Lo.getNode());
     AddToWorklist(Hi.getNode());
Index: ../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- ../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ ../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1212,7 +1212,7 @@
 
   return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
                             N->getMemoryVT(), N->getMemOperand(),
-                            TruncateStore);
+                            TruncateStore, N->isCompressingStore());
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
Index: ../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
===================================================================
--- ../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ ../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -878,6 +878,8 @@
   void visitMaskedStore(const CallInst &I);
   void visitMaskedGather(const CallInst &I);
   void visitMaskedScatter(const CallInst &I);
+  void visitMaskedExpandLoad(const CallInst &I);
+  void visitMaskedCompressStore(const CallInst &I);
   void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
   void visitAtomicRMW(const AtomicRMWInst &I);
   void visitFence(const FenceInst &I);
Index: ../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- ../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ ../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3689,7 +3689,33 @@
                           MachineMemOperand::MOStore,  VT.getStoreSize(),
                           Alignment, AAInfo);
   SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
-                                         MMO, false);
+                                         MMO);
   DAG.setRoot(StoreNode);
   setValue(&I, StoreNode);
 }
 
+void SelectionDAGBuilder::visitMaskedCompressStore(const CallInst &I) {
+  SDLoc sdl = getCurSDLoc();
+
+  // void llvm.masked.compressstore.*(Src0, Ptr, Mask)
+  Value *PtrOperand = I.getArgOperand(1);
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(I.getArgOperand(0));
+  SDValue Mask = getValue(I.getArgOperand(2));
+  EVT VT = Src0.getValueType();
+  unsigned Alignment = DAG.getEVTAlignment(VT.getVectorElementType());
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
+
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction().
+    getMachineMemOperand(MachinePointerInfo(PtrOperand),
+                         MachineMemOperand::MOStore, VT.getStoreSize(),
+                         Alignment, AAInfo);
+  SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
+                                         MMO, false /* truncating */,
+                                         true /* compressing */);
+  DAG.setRoot(StoreNode);
+  setValue(&I, StoreNode);
+}
@@ -3821,7 +3847,44 @@
                           Alignment, AAInfo, Ranges);
 
   SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
-                                   ISD::NON_EXTLOAD, false);
+                                   ISD::NON_EXTLOAD, false /* IsExpanding */);
+  if (AddToChain) {
+    SDValue OutChain = Load.getValue(1);
+    DAG.setRoot(OutChain);
+  }
+  setValue(&I, Load);
+}
+
+void SelectionDAGBuilder::visitMaskedExpandLoad(const CallInst &I) {
+  SDLoc sdl = getCurSDLoc();
+
+  // @llvm.masked.expandload.*(Ptr, Mask, Src0)
+  Value *PtrOperand = I.getArgOperand(0);
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(I.getArgOperand(2));
+  SDValue Mask = getValue(I.getArgOperand(1));
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+  unsigned Alignment = DAG.getEVTAlignment(VT.getVectorElementType());
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
+  const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
+
+  // Do not serialize masked loads of constant memory with anything.
+  bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation(
+      PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo));
+  SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction().
+    getMachineMemOperand(MachinePointerInfo(PtrOperand),
+                         MachineMemOperand::MOLoad, VT.getStoreSize(),
+                         Alignment, AAInfo, Ranges);
+
+  SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
+                                   ISD::NON_EXTLOAD, true /* IsExpanding */);
   if (AddToChain) {
     SDValue OutChain = Load.getValue(1);
     DAG.setRoot(OutChain);
@@ -5054,6 +5117,12 @@
   case Intrinsic::masked_store:
     visitMaskedStore(I);
     return nullptr;
+  case Intrinsic::masked_expandload:
+    visitMaskedExpandLoad(I);
+    return nullptr;
+  case Intrinsic::masked_compressstore:
+    visitMaskedCompressStore(I);
+    return nullptr;
   case Intrinsic::x86_mmx_pslli_w:
   case Intrinsic::x86_mmx_pslli_d:
   case Intrinsic::x86_mmx_pslli_q:
Index: ../lib/IR/Function.cpp
===================================================================
--- ../lib/IR/Function.cpp
+++ ../lib/IR/Function.cpp
@@ -607,10 +607,11 @@
   IIT_HALF_VEC_ARG = 30,
   IIT_SAME_VEC_WIDTH_ARG = 31,
   IIT_PTR_TO_ARG = 32,
-  IIT_VEC_OF_PTRS_TO_ELT = 33,
-  IIT_I128 = 34,
-  IIT_V512 = 35,
-  IIT_V1024 = 36
+  IIT_PTR_TO_ELT = 33,
+  IIT_VEC_OF_PTRS_TO_ELT = 34,
+  IIT_I128 = 35,
+  IIT_V512 = 36,
+  IIT_V1024 = 37
 };
 
@@ -744,6 +745,11 @@
                                              ArgInfo));
     return;
   }
+  case IIT_PTR_TO_ELT: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::PtrToElt, ArgInfo));
+    return;
+  }
   case IIT_VEC_OF_PTRS_TO_ELT: {
     unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfPtrsToElt,
@@ -870,6 +876,14 @@
     Type *Ty = Tys[D.getArgumentNumber()];
     return PointerType::getUnqual(Ty);
   }
+  case IITDescriptor::PtrToElt: {
+    Type *Ty = Tys[D.getArgumentNumber()];
+    VectorType *VTy = dyn_cast<VectorType>(Ty);
+    if (!VTy)
+      llvm_unreachable("Expected an argument of Vector Type");
+    Type *EltTy = VTy->getVectorElementType();
+    return PointerType::getUnqual(EltTy);
+  }
   case IITDescriptor::VecOfPtrsToElt: {
     Type *Ty = Tys[D.getArgumentNumber()];
     VectorType *VTy = dyn_cast<VectorType>(Ty);
@@ -1064,6 +1078,16 @@
       PointerType *ThisArgType = dyn_cast<PointerType>(Ty);
       return (!ThisArgType || ThisArgType->getElementType() != ReferenceType);
     }
+    case IITDescriptor::PtrToElt: {
+      if (D.getArgumentNumber() >= ArgTys.size())
+        return true;
+      VectorType *ReferenceType =
+        dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+      PointerType *ThisArgType = dyn_cast<PointerType>(Ty);
+
+      return (!ThisArgType || !ReferenceType ||
+              ThisArgType->getElementType() != ReferenceType->getElementType());
+    }
     case IITDescriptor::VecOfPtrsToElt: {
       if (D.getArgumentNumber() >= ArgTys.size())
         return true;
Index: ../lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- ../lib/Target/X86/X86ISelLowering.cpp
+++ ../lib/Target/X86/X86ISelLowering.cpp
@@ -1232,10 +1232,11 @@
       setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
     } else {
-      setOperationAction(ISD::MLOAD,  MVT::v8i32, Custom);
-      setOperationAction(ISD::MLOAD,  MVT::v8f32, Custom);
-      setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
-      setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+        setOperationAction(ISD::MLOAD,  VT, Custom);
+        setOperationAction(ISD::MSTORE, VT, Custom);
+      }
     }
     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
@@ -21940,26 +21941,48 @@
   SDValue Mask = N->getMask();
   SDLoc dl(Op);
 
+  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
+         "Expanding masked load is supported on AVX-512 targets only!");
+
+  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
+         "Expanding masked load is supported for 32 and 64-bit types only!");
+
+  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
+  // VLX. Expanding loads of these types are handled below.
+  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
+    return Op;
+
   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
          "Cannot lower masked load op.");
-  assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
          "Unsupported masked load op.");
 
   // This operation is legal for targets with VLX, but without
   // VLX the vector should be widened to 512 bit
-  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
-  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   SDValue Src0 = N->getSrc0();
   Src0 = ExtendToType(Src0, WideDataVT, DAG);
+
+  // Mask element has to be i1
+  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+         "We handle 4x32, 4x64 and 2x64 vectors only in this case");
+
+  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+  if (MaskEltTy != MVT::i1)
+    Mask = DAG.getNode(ISD::TRUNCATE, dl,
+                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                       N->getBasePtr(), Mask, Src0,
                                       N->getMemoryVT(), N->getMemOperand(),
-                                      N->getExtensionType());
+                                      N->getExtensionType(),
+                                      N->isExpandingLoad());
 
   SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
@@ -21977,10 +22000,20 @@
   SDValue Mask = N->getMask();
   SDLoc dl(Op);
 
+  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
+         "Compressing masked store is supported on AVX-512 targets only!");
+
+  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
+         "Compressing masked store is supported for 32 and 64-bit types only!");
+
+  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
+  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
+    return Op;
+
   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
          "Cannot lower masked store op.");
-  assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
          "Unsupported masked store op.");
@@ -21989,12 +22022,22 @@
   // This operation is legal for targets with VLX, but without
   // VLX the vector should be widened to 512 bit
   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
-  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+
+  // Mask element has to be i1
+  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+         "We handle 4x32, 4x64 and 2x64 vectors only in this case");
+
+  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+  if (MaskEltTy != MVT::i1)
+    Mask = DAG.getNode(ISD::TRUNCATE, dl,
+                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                             Mask, N->getMemoryVT(), N->getMemOperand(),
-                            N->isTruncatingStore());
+                            N->isTruncatingStore(), N->isCompressingStore());
 }
 
 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
@@ -29881,6 +29924,11 @@
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+
+  // TODO: Expanding load with constant mask may be optimized as well.
+  if (Mld->isExpandingLoad())
+    return SDValue();
+
   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
     if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
       return ScalarLoad;
@@ -29996,6 +30044,10 @@
 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+
+  if (Mst->isCompressingStore())
+    return SDValue();
+
   if (!Mst->isTruncatingStore())
     return reduceMaskedStoreToScalarStore(Mst, DAG);
Index: ../lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- ../lib/Target/X86/X86InstrFragmentsSIMD.td
+++ ../lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -965,28 +965,23 @@
 def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                          (X86mstore node:$src1, node:$src2, node:$src3), [{
-  if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
-    return Store->getAlignment() >= 16;
-  return false;
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16;
 }]>;
 
 def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                          (X86mstore node:$src1, node:$src2, node:$src3), [{
-  if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
-    return Store->getAlignment() >= 32;
-  return false;
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32;
 }]>;
 
 def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                          (X86mstore node:$src1, node:$src2, node:$src3), [{
-  if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
-    return Store->getAlignment() >= 64;
-  return false;
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64;
 }]>;
 
 def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                         (X86mstore node:$src1, node:$src2, node:$src3), [{
-  return isa<MaskedStoreSDNode>(N);
+                         (masked_store node:$src1, node:$src2, node:$src3), [{
+  return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+         (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
 }]>;
 
 def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
Index: ../test/CodeGen/X86/compress_expand.ll
===================================================================
--- ../test/CodeGen/X86/compress_expand.ll
+++ ../test/CodeGen/X86/compress_expand.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+
+
+define <16 x float> @test1(float* %base) {
+; ALL-LABEL: test1:
+; ALL:       # BB#0:
+; ALL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
+; ALL-NEXT:    retq
+  %res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define <16 x float> @test2(float* %base, <16 x float> %src0) {
+; ALL-LABEL: test2:
+; ALL:       # BB#0:
+; ALL-NEXT:    movw $30719, %ax # imm = 0x77FF
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
+; ALL-NEXT:    retq
+  %res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x float> %src0)
+  ret <16 x float>%res
+}
+
+define <8 x double> @test3(double* %base, <8 x double> %src0, <8 x i1> %mask) {
+; SKX-LABEL: test3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT:    vpmovw2m %xmm1, %k1
+; SKX-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test3:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
+; KNL-NEXT:    retq
+  %res = call <8 x double> @llvm.masked.expandload.v8f64(double* %base, <8 x i1> %mask, <8 x double> %src0)
+  ret <8 x double>%res
+}
+
+define <4 x float> @test4(float* %base, <4 x float> %src0) {
+; SKX-LABEL: test4:
+; SKX:       # BB#0:
+; SKX-NEXT:    movb $7, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test4:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %ZMM0
+; KNL-NEXT:    movw $7, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %ZMM0
+; KNL-NEXT:    retq
+  %res = call <4 x float> @llvm.masked.expandload.v4f32(float* %base, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> %src0)
+  ret <4 x float>%res
+}
+
+define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
+; SKX-LABEL: test5:
+; SKX:       # BB#0:
+; SKX-NEXT:    movb $2, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpexpandq (%rdi), %xmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test5:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %ZMM0
+; KNL-NEXT:    movb $2, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpexpandq (%rdi), %zmm0 {%k1}
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %ZMM0
+; KNL-NEXT:    retq
+  %res = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %base, <2 x i1> <i1 false, i1 true>, <2 x i64> %src0)
+  ret <2 x i64>%res
+}
+
+declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
+declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
+declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
+
+define void @test6(float* %base, <16 x float> %V) {
+; ALL-LABEL: test6:
+; ALL:       # BB#0:
+; ALL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
+; ALL-NEXT:    retq
+  call void @llvm.masked.compressstore.v16f32(<16 x float> %V, float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
+; SKX-LABEL: test7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT:    vpmovw2m %xmm1, %k1
+; SKX-NEXT:    vcompressps %ymm0, (%rdi) {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test7:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k1
+; KNL-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
+; KNL-NEXT:    retq
+  call void @llvm.masked.compressstore.v8f32(<8 x float> %V, float* %base, <8 x i1> %mask)
+  ret void
+}
+
+define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
+; SKX-LABEL: test8:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT:    vpmovw2m %xmm1, %k1
+; SKX-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test8:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
+; KNL-NEXT:    retq
+  call void @llvm.masked.compressstore.v8f64(<8 x double> %V, double* %base, <8 x i1> %mask)
+  ret void
+}
+
+define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
+; SKX-LABEL: test9:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT:    vpmovw2m %xmm1, %k1
+; SKX-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test9:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
+; KNL-NEXT:    retq
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> %V, i64* %base, <8 x i1> %mask)
+  ret void
+}
+
+define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
+; SKX-LABEL: test10:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
+; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT:    vpcompressq %ymm0, (%rdi) {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test10:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    vpslld $31, %xmm1, %xmm1
+; KNL-NEXT:    vpsrad $31, %xmm1, %xmm1
+; KNL-NEXT:    vpmovsxdq %xmm1, %ymm1
+; KNL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
+; KNL-NEXT:    retq
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> %V, i64* %base, <4 x i1> %mask)
+  ret void
+}
+
+define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
+; SKX-LABEL: test11:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
+; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT:    vpcompressq %xmm0, (%rdi) {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test11:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %ZMM0
+; KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
+; KNL-NEXT:    vpsrad $31, %xmm1, %xmm1
+; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; KNL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm2, %zmm1
+; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
+; KNL-NEXT:    retq
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> %V, i64* %base, <2 x i1> %mask)
+  ret void
+}
+
+define void @test12(float* %base, <4 x float> %V, <4 x i1> %mask) {
+; SKX-LABEL: test12:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
+; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test12:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %ZMM0
+; KNL-NEXT:    vpslld $31, %xmm1, %xmm1
+; KNL-NEXT:    vpsrad $31, %xmm1, %xmm1
+; KNL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm2, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
+; KNL-NEXT:    retq
+  call void @llvm.masked.compressstore.v4f32(<4 x float> %V, float* %base, <4 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, float* , <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, float* , <8 x i1>)
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, double* , <8 x i1>)
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32* , <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32* , <8 x i1>)
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64* , <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32* , <4 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, float* , <4 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64* , <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64* , <2 x i1>)
Index: ../utils/TableGen/CodeGenTarget.cpp
===================================================================
--- ../utils/TableGen/CodeGenTarget.cpp
+++ ../utils/TableGen/CodeGenTarget.cpp
@@ -550,8 +550,7 @@
       // overloaded, all the types can be specified directly.
       assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
                !TyEl->isSubClassOf("LLVMTruncatedType") &&
-               !TyEl->isSubClassOf("LLVMVectorSameWidth") &&
-               !TyEl->isSubClassOf("LLVMPointerToElt")) ||
+               !TyEl->isSubClassOf("LLVMVectorSameWidth")) ||
              VT == MVT::iAny || VT == MVT::vAny) &&
             "Expected iAny or vAny type");
     } else
Index: ../utils/TableGen/IntrinsicEmitter.cpp
===================================================================
--- ../utils/TableGen/IntrinsicEmitter.cpp
+++ ../utils/TableGen/IntrinsicEmitter.cpp
@@ -213,10 +213,11 @@
   IIT_HALF_VEC_ARG = 30,
   IIT_SAME_VEC_WIDTH_ARG = 31,
   IIT_PTR_TO_ARG = 32,
-  IIT_VEC_OF_PTRS_TO_ELT = 33,
-  IIT_I128 = 34,
-  IIT_V512 = 35,
-  IIT_V1024 = 36
+  IIT_PTR_TO_ELT = 33,
+  IIT_VEC_OF_PTRS_TO_ELT = 34,
+  IIT_I128 = 35,
+  IIT_V512 = 36,
+  IIT_V1024 = 37
 };
 
@@ -277,6 +278,8 @@
       Sig.push_back(IIT_PTR_TO_ARG);
     else if (R->isSubClassOf("LLVMVectorOfPointersToElt"))
       Sig.push_back(IIT_VEC_OF_PTRS_TO_ELT);
+    else if (R->isSubClassOf("LLVMPointerToElt"))
+      Sig.push_back(IIT_PTR_TO_ELT);
     else
       Sig.push_back(IIT_ARG);
     return Sig.push_back((Number << 3) | ArgCodes[Number]);
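
For reference, a minimal LLVM IR sketch (not part of the patch) of how the two new intrinsics are meant to be called. The operand order follows the Intrinsics.td definitions above and the declarations in the new compress_expand.ll test; the function name @expand_compress_example is made up for illustration.

; expandload reads the mask-enabled number of elements from consecutive
; locations starting at %ptr and places them into the enabled lanes of the
; result; disabled lanes take their value from the passthru operand.
; compressstore gathers the enabled lanes of %val and stores them contiguously
; starting at %ptr.
define <8 x double> @expand_compress_example(double* %ptr, <8 x i1> %mask,
                                             <8 x double> %passthru, <8 x double> %val) {
  %ld = call <8 x double> @llvm.masked.expandload.v8f64(double* %ptr, <8 x i1> %mask, <8 x double> %passthru)
  call void @llvm.masked.compressstore.v8f64(<8 x double> %val, double* %ptr, <8 x i1> %mask)
  ret <8 x double> %ld
}

declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)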