Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26720,13 +26720,74 @@
   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
 }
-/// PerformMSTORECombine - Resolve truncating stores
+
+/// If this is a build vector of boolean constants and exactly one of those
+/// constants is true, return the operand index of that true element.
+/// Otherwise, return -1.
+static int getOneTrueElt(const SDValue V) {
+  // This needs to be a build vector of booleans.
+  auto *BV = dyn_cast<BuildVectorSDNode>(V);
+  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
+    return -1;
+
+  int TrueIndex = -1;
+  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    const SDValue &Op = BV->getOperand(i);
+    if (Op.getOpcode() == ISD::UNDEF)
+      continue;
+    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
+    if (!ConstNode)
+      return -1;
+    if (ConstNode->getAPIntValue().isAllOnesValue()) {
+      // If we already found a one, this is too many.
+      if (TrueIndex >= 0)
+        return -1;
+      TrueIndex = i;
+    }
+  }
+  return TrueIndex;
+}
+
+/// If exactly one element of the mask is set for a non-truncating masked
+/// store, it can be replaced by a vector extract and a scalar store.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those.
+static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
+                                              SelectionDAG &DAG) {
+  int TrueMaskElt = getOneTrueElt(MS->getMask());
+  if (TrueMaskElt < 0)
+    return SDValue();
+
+  SDLoc DL(MS);
+  EVT VT = MS->getValue().getValueType();
+  EVT EltVT = VT.getVectorElementType();
+
+  // Extract the one scalar element that is actually being stored.
+  SDValue ExtractIndex = DAG.getIntPtrConstant(TrueMaskElt, DL);
+  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+                                MS->getValue(), ExtractIndex);
+
+  // Store that element at the appropriate offset from the base pointer.
+  unsigned EltSize = EltVT.getStoreSize();
+  unsigned StoreOffset = TrueMaskElt * EltSize;
+  SDValue StoreOffsetVal = DAG.getIntPtrConstant(StoreOffset, DL);
+  SDValue BasePtr = MS->getBasePtr();
+  SDValue IndexedPtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(),
+                                   BasePtr, StoreOffsetVal);
+  return DAG.getStore(MS->getChain(), DL, Extract, IndexedPtr,
+                      MS->getPointerInfo(), MS->isVolatile(),
+                      MS->isNonTemporal(),
+                      MinAlign(MS->getAlignment(), EltSize));
+}
+
 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
   if (!Mst->isTruncatingStore())
-    return SDValue();
+    return reduceMaskedStoreToScalarStore(Mst, DAG);
 
+  // Resolve truncating stores.
   EVT VT = Mst->getValue().getValueType();
   unsigned NumElems = VT.getVectorNumElements();
   EVT StVT = Mst->getMemoryVT();
Index: test/CodeGen/X86/masked_memop.ll
===================================================================
--- test/CodeGen/X86/masked_memop.ll
+++ test/CodeGen/X86/masked_memop.ll
@@ -991,36 +991,55 @@
   ret void
 }
 
-define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
-; AVX1-LABEL: test22:
+; When only one element of the mask is set, reduce to a scalar store.
+
+define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+; AVX1-LABEL: one_mask_bit_set1:
 ; AVX1:       ## BB#0:
-; AVX1-NEXT:    movl $-1, %eax
-; AVX1-NEXT:    vmovd %eax, %xmm0
-; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT:    vmovd %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test22:
+; AVX2-LABEL: one_mask_bit_set1:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    movl $-1, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT:    vmovd %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: test22:
+; AVX512F-LABEL: one_mask_bit_set1:
 ; AVX512F:       ## BB#0:
-; AVX512F-NEXT:    movl $-1, %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm0
-; AVX512F-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT:    vmovd %xmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
-; SKX-LABEL: test22:
+; SKX-LABEL: one_mask_bit_set1:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    movb $1, %al
-; SKX-NEXT:    kmovw %eax, %k1
-; SKX-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
+; SKX-NEXT:    vmovd %xmm0, (%rdi)
 ; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+  call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+  ret void
+}
+
+; Choose a different element to show that the correct address offset is produced.
+
+define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+; AVX1-LABEL: one_mask_bit_set2:
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: one_mask_bit_set2:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: one_mask_bit_set2:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; AVX512F-NEXT:    retq
+;
+; SKX-LABEL: one_mask_bit_set2:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; SKX-NEXT:    retq
+  call void @llvm.masked.store.v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
   ret void
 }
 
@@ -1032,6 +1051,7 @@
 declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
 declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
 declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
 declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
 declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
 declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)