Skip to content

Commit

Permalink
[X86][SSE] Split large PAVGB/PAVGW vectors to legal widths
Browse files Browse the repository at this point in the history
Patch to allow detectAVGPattern to handle vectors larger than the legal size (128 bits for SSE2, 256 for AVX2, 512 for AVX512BW) by splitting the vectors accordingly.

Differential Revision: https://reviews.llvm.org/D41440

llvm-svn: 321288
  • Loading branch information
RKSimon committed Dec 21, 2017
1 parent 6452efd commit 4de5bb0
Showing 3 changed files with 318 additions and 2,323 deletions.
46 changes: 31 additions & 15 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
@@ -33897,16 +33897,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,

if (!Subtarget.hasSSE2())
return SDValue();
if (Subtarget.hasBWI()) {
if (VT.getSizeInBits() > 512)
return SDValue();
} else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256)
return SDValue();
} else {
if (VT.getSizeInBits() > 128)
return SDValue();
}

// Detect the following pattern:
//
@@ -33918,7 +33908,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// %6 = trunc <N x i32> %5 to <N x i8>
//
// In AVX512, the last instruction can also be a trunc store.

if (In.getOpcode() != ISD::SRL)
return SDValue();

@@ -33939,6 +33928,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return true;
};

// Split vectors to legal target size and apply AVG.
auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
unsigned NumSubs = 1;
if (Subtarget.hasBWI()) {
if (VT.getSizeInBits() > 512)
NumSubs = VT.getSizeInBits() / 512;
} else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256)
NumSubs = VT.getSizeInBits() / 256;
} else {
if (VT.getSizeInBits() > 128)
NumSubs = VT.getSizeInBits() / 128;
}

if (NumSubs == 1)
return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);

SmallVector<SDValue, 4> Subs;
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
VT.getVectorNumElements() / NumSubs);
for (unsigned i = 0; i != NumSubs; ++i) {
unsigned Idx = i * SubVT.getVectorNumElements();
SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
};

// Check if each element of the vector is left-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
@@ -33962,8 +33980,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
Operands[1]);
return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
}

if (Operands[0].getOpcode() == ISD::ADD)
@@ -33987,8 +34004,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return SDValue();

// The pattern is detected, emit X86ISD::AVG instruction.
return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
Operands[1].getOperand(0));
return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
}

return SDValue();
24 changes: 4 additions & 20 deletions llvm/test/CodeGen/X86/avg-mask.ll
Original file line number Diff line number Diff line change
@@ -143,16 +143,8 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7
; AVX512F-NEXT: vpavgb %xmm7, %xmm6, %xmm6
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512F-NEXT: vpavgb %xmm7, %xmm8, %xmm7
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
@@ -201,16 +193,8 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
2,571 changes: 283 additions & 2,288 deletions llvm/test/CodeGen/X86/avg.ll

Large diffs are not rendered by default.

0 comments on commit 4de5bb0

Please sign in to comment.