Skip to content

Commit 5e0ceb8

Browse files
committed Feb 26, 2018
[X86] Add a custom legalization for (i16 (bitcast v16i1)) and (i32 (bitcast v32i1)) without AVX512 to prevent scalarization
Summary: We have an early DAG combine to turn these patterns into MOVMSK, but that combine doesn't work if the vXi1 type has more elements than the widest legal vXi8 type. Type legalization will eventually split it down to v16i1 or v32i1, and then the bitcast gets legalized to a truncstore and a scalar load. The truncstore will get lowered to a series of extracts and bit math. This patch adds a custom legalization to use a sign extend and MOVMSK instead. This prevents the eventual scalarization.
Reviewers: spatel, RKSimon, zvi
Reviewed By: RKSimon
Subscribers: mgorny, llvm-commits
Differential Revision: https://reviews.llvm.org/D43593
llvm-svn: 326119
1 parent 6daad9d commit 5e0ceb8

File tree

4 files changed

+121
-1698
lines changed

4 files changed

+121
-1698
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

+35-11
Original file line number | Diff line number | Diff line change
@@ -883,6 +883,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
883883
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
884884
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
885885
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
886+
if (!Subtarget.hasAVX512())
887+
setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
886888

887889
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
888890
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
@@ -1012,6 +1014,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
10121014
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
10131015
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
10141016

1017+
if (!Subtarget.hasAVX512())
1018+
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1019+
10151020
for (MVT VT : MVT::fp_vector_valuetypes())
10161021
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
10171022

@@ -23740,6 +23745,24 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
2374023745
return SDValue();
2374123746
}
2374223747

23748+
// Create MOVMSKB, taking into account whether we need to split for AVX1.
//
// Builds an i32 value holding the MOVMSK result for the vector V.
//  - If V is v32i8 and the subtarget does not report hasInt256(), V is split
//    into two halves, MOVMSK is emitted for each half, and the two i32
//    results are merged as (Lo | (Hi << 16)).
//  - Otherwise a single X86ISD::MOVMSK node is emitted directly on V.
//
// DL is the debug location attached to every node created here. The result
// type is always MVT::i32; callers are expected to zero-extend or truncate it
// to the type they need.
23749+
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
23750+
const X86Subtarget &Subtarget) {
23751+
MVT InVT = V.getSimpleValueType();
23752+
23753+
// A 256-bit byte vector cannot be handled in one node without Int256
// support, so handle each half separately and recombine the masks.
if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
23754+
SDValue Lo, Hi;
23755+
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
23756+
Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
23757+
Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
23758+
// Move the high half's mask above the 16 mask bits of the low half.
Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
23759+
DAG.getConstant(16, DL, MVT::i8));
23760+
return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
23761+
}
23762+
23763+
// Common case: one MOVMSK covers the whole input vector.
return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23764+
}
23765+
2374323766
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
2374423767
SelectionDAG &DAG) {
2374523768
SDValue Src = Op.getOperand(0);
@@ -23765,6 +23788,16 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
2376523788
if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector())
2376623789
return Lower512IntUnary(Op, DAG);
2376723790

23791+
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
23792+
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
23793+
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
23794+
MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
23795+
SDLoc DL(Op);
23796+
SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
23797+
V = getPMOVMSKB(DL, V, DAG, Subtarget);
23798+
return DAG.getZExtOrTrunc(V, DL, DstVT);
23799+
}
23800+
2376823801
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
2376923802
SrcVT == MVT::i64) {
2377023803
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
@@ -30648,17 +30681,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
3064830681
SDLoc DL(BitCast);
3064930682
SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
3065030683

30651-
if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30652-
// Handle pre-AVX2 cases by splitting to two v16i1's.
30653-
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30654-
MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
30655-
SDValue Lo = extract128BitVector(V, 0, DAG, DL);
30656-
SDValue Hi = extract128BitVector(V, 16, DAG, DL);
30657-
Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30658-
Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30659-
Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30660-
DAG.getConstant(16, DL, ShiftTy));
30661-
V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30684+
if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
30685+
V = getPMOVMSKB(DL, V, DAG, Subtarget);
3066230686
return DAG.getZExtOrTrunc(V, DL, VT);
3066330687
}
3066430688

0 commit comments

Comments
 (0)