Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8205,7 +8205,7 @@ bool llvm::isBitwiseNot(SDValue V) { if (V.getOpcode() != ISD::XOR) return false; - ConstantSDNode *C = isConstOrConstSplat(V.getOperand(1)); + ConstantSDNode *C = isConstOrConstSplat(peekThroughBitcasts(V.getOperand(1))); return C && C->isAllOnesValue(); } Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -40152,6 +40152,31 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + // For AVX1 only, if we are extracting from a 256-bit and+not (which will + // eventually get combined/lowered into ANDNP), split the 'and' into 128-bit + // ops to avoid the extraction (and likely concatenation before this). We let + // generic combining take over from there to simplify the insert/extract and + // 'not'. + // This pattern emerges during AVX1 legalization. We handle it before lowering + // to avoid complications like splitting constant vector loads. + + // Capture the original wide type in the likely case that we need to bitcast + // back to this type. + EVT VT = N->getValueType(0); + EVT WideVecVT = N->getOperand(0).getValueType(); + SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); + if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && WideVecVT.isSimple() && + WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) { + SDValue WideOp0 = peekThroughBitcasts(WideVec.getOperand(0)); + SDValue WideOp1 = peekThroughBitcasts(WideVec.getOperand(1)); + if (isBitwiseNot(WideOp0) || isBitwiseNot(WideOp1)) { + // extract (and v4i64 X, (not Y)), n --> andnp v2i64 X(n), Y(n) + SDValue Concat = split256IntArith(WideVec, DAG); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, + DAG.getBitcast(WideVecVT, Concat), N->getOperand(1)); + } + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); Index: test/CodeGen/X86/avx-logic.ll =================================================================== --- test/CodeGen/X86/avx-logic.ll +++ test/CodeGen/X86/avx-logic.ll @@ -342,9 +342,9 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735] +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0