diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44499,7 +44499,8 @@
 /// mask have already been optimized in IR, so we don't bother with those here.
 static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML,
                                             SelectionDAG &DAG,
-                                            TargetLowering::DAGCombinerInfo &DCI) {
+                                            TargetLowering::DAGCombinerInfo &DCI,
+                                            const X86Subtarget &Subtarget) {
   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   // However, some target hooks may need to be added to know when the transform
@@ -44516,14 +44517,25 @@
   SDLoc DL(ML);
   EVT VT = ML->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
+
+  EVT CastVT = VT;
+  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+    EltVT = MVT::f64;
+    CastVT =
+        EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+  }
+
   SDValue Load =
       DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
                   ML->getPointerInfo().getWithOffset(Offset), Alignment,
                   ML->getMemOperand()->getFlags());
 
+  SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
+
   // Insert the loaded element into the appropriate place in the vector.
-  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
-                               ML->getPassThru(), Load, VecIndex);
+  SDValue Insert =
+      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
+  Insert = DAG.getBitcast(VT, Insert);
   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
 }
 
@@ -44586,7 +44598,8 @@
     return SDValue();
 
   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
-    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
+    if (SDValue ScalarLoad =
+            reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
       return ScalarLoad;
 
     // TODO: Do some AVX512 subsets benefit from this transform?
@@ -44623,7 +44636,8 @@
 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
 /// mask have already been optimized in IR, so we don't bother with those here.
 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                               SelectionDAG &DAG) {
-                                              SelectionDAG &DAG) {
+                                              SelectionDAG &DAG,
+                                              const X86Subtarget &Subtarget) {
   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   // However, some target hooks may need to be added to know when the transform
   // is profitable. Endianness would also have to be considered.
@@ -44636,10 +44650,17 @@
 
   // Extract the one scalar element that is actually being stored.
   SDLoc DL(MS);
-  EVT VT = MS->getValue().getValueType();
+  SDValue Value = MS->getValue();
+  EVT VT = Value.getValueType();
   EVT EltVT = VT.getVectorElementType();
-  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
-                                MS->getValue(), VecIndex);
+  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+    EltVT = MVT::f64;
+    EVT CastVT =
+        EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+    Value = DAG.getBitcast(CastVT, Value);
+  }
+  SDValue Extract =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
 
   // Store that element at the appropriate offset from the base pointer.
   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
@@ -44661,7 +44682,7 @@
   if (Mst->isTruncatingStore())
     return SDValue();
 
-  if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+  if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
     return ScalarStore;
 
   // If the mask value has been legalized to a non-boolean vector, try to
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6504,8 +6504,7 @@
 ; X86-AVX512-LABEL: mload_constmask_v2i64:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    vpinsrd $2, 8(%eax), %xmm0, %xmm0
-; X86-AVX512-NEXT:    vpinsrd $3, 12(%eax), %xmm0, %xmm0
+; X86-AVX512-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-AVX512-NEXT:    retl
   %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 false, i1 true>, <2 x i64> %dst)
   ret <2 x i64> %res
@@ -7109,10 +7108,9 @@
 ; X86-AVX512-LABEL: load_one_mask_bit_set3:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT:    vpinsrd $0, 16(%eax), %xmm1, %xmm1
-; X86-AVX512-NEXT:    vpinsrd $1, 20(%eax), %xmm1, %xmm1
-; X86-AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX512-NEXT:    vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    retl
   %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
   ret <4 x i64> %res
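
For reference, a minimal IR reproducer of the pattern this combine handles on 32-bit targets is sketched below; the function name @one_mask_bit_set_hi and the llc flags in the comments are illustrative assumptions, not taken from the test file above.

; Illustrative sketch; assumed invocation:
;   llc -mtriple=i686-unknown-linux-gnu -mattr=avx512f,avx512vl
; A masked load of <2 x i64> whose constant mask has exactly one true element is
; the shape that reduceMaskedLoadToScalarLoad scalarizes; with this patch the
; 32-bit target loads the element as an f64 (vmovhps) instead of two vpinsrd's.
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)

define <2 x i64> @one_mask_bit_set_hi(<2 x i64>* %addr, <2 x i64> %dst) {
  %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 false, i1 true>, <2 x i64> %dst)
  ret <2 x i64> %res
}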