Skip to content

Commit d7bae45

Browse files
author
Igor Breger
committedOct 15, 2015
AVX512: Implemented DAG lowering for shuff64x2/shufi64x2 instructions (shuffle packed values at 128-bit granularity)
Differential Revision: http://reviews.llvm.org/D13648 llvm-svn: 250400
1 parent 59e569b commit d7bae45

File tree

8 files changed

+406
-121
lines changed

8 files changed

+406
-121
lines changed
 

‎llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp

+64-1
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,51 @@ static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
107107
}
108108
}
109109

110+
// Expands to the six case labels covering one VSHUFF/VSHUFI opcode width:
// both the float (VSHUFF) and integer (VSHUFI) variants, each in plain (i),
// merge-masked (ik) and zero-masked (ikz) forms.
#define CASE_VSHUF_COMMON(Inst, Suffix, src2) \
  case X86::VSHUFF##Inst##Suffix##r##src2##i: \
  case X86::VSHUFF##Inst##Suffix##r##src2##ik: \
  case X86::VSHUFF##Inst##Suffix##r##src2##ikz: \
  case X86::VSHUFI##Inst##Suffix##r##src2##i: \
  case X86::VSHUFI##Inst##Suffix##r##src2##ik: \
  case X86::VSHUFI##Inst##Suffix##r##src2##ikz:

// Expands to every case label of one VSHUF family: register (r) and
// memory (m) second-source forms at 512-bit (Z) and 256-bit (Z256) widths.
#define CASE_VSHUF(Inst) \
  CASE_VSHUF_COMMON(Inst, Z, r) \
  CASE_VSHUF_COMMON(Inst, Z, m) \
  CASE_VSHUF_COMMON(Inst, Z256, r) \
  CASE_VSHUF_COMMON(Inst, Z256, m) \

/// \brief Extracts the types and if it has memory operand for a given
/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction.
/// \param MI        instruction to inspect; must be one of the VSHUF64x2
///                  family opcodes, otherwise llvm_unreachable fires.
/// \param VT  [out] element vector type of the operation: v8i64/v16i32 for
///                  the 512-bit (Z) forms, v4i64/v8i32 for 256-bit (Z256).
/// \param HasMemOp [out] true iff the second source operand is in memory.
static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) {
  HasMemOp = false;
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Unknown VSHUF64x2 family instructions.");
    break;
  // Memory cases set HasMemOp then fall into the matching register case,
  // which records the vector type shared by both forms.
  CASE_VSHUF_COMMON(64X2, Z, m)
    HasMemOp = true; // FALL THROUGH.
  CASE_VSHUF_COMMON(64X2, Z, r)
    VT = MVT::v8i64;
    break;
  CASE_VSHUF_COMMON(64X2, Z256, m)
    HasMemOp = true; // FALL THROUGH.
  CASE_VSHUF_COMMON(64X2, Z256, r)
    VT = MVT::v4i64;
    break;
  CASE_VSHUF_COMMON(32X4, Z, m)
    HasMemOp = true; // FALL THROUGH.
  CASE_VSHUF_COMMON(32X4, Z, r)
    VT = MVT::v16i32;
    break;
  CASE_VSHUF_COMMON(32X4, Z256, m)
    HasMemOp = true; // FALL THROUGH.
  CASE_VSHUF_COMMON(32X4, Z256, r)
    VT = MVT::v8i32;
    break;
  }
}
154+
110155
//===----------------------------------------------------------------------===//
111156
// Top Level Entrypoint
112157
//===----------------------------------------------------------------------===//
@@ -726,7 +771,25 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
726771
Src1Name = getRegName(MI->getOperand(1).getReg());
727772
DestName = getRegName(MI->getOperand(0).getReg());
728773
break;
729-
774+
CASE_VSHUF(64X2)
775+
CASE_VSHUF(32X4) {
776+
MVT VT;
777+
bool HasMemOp;
778+
unsigned NumOp = MI->getNumOperands();
779+
getVSHUF64x2FamilyInfo(MI, VT, HasMemOp);
780+
decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(),
781+
ShuffleMask);
782+
DestName = getRegName(MI->getOperand(0).getReg());
783+
if (HasMemOp) {
784+
assert((NumOp >= 8) && "Expected at least 8 operands!");
785+
Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg());
786+
} else {
787+
assert((NumOp >= 4) && "Expected at least 4 operands!");
788+
Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg());
789+
Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg());
790+
}
791+
break;
792+
}
730793
case X86::UNPCKLPDrr:
731794
case X86::VUNPCKLPDrr:
732795
Src2Name = getRegName(MI->getOperand(2).getReg());

‎llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp

+20
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,26 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
264264
}
265265
}
266266

267+
/// \brief Decode a shuffle packed values at 128-bit granularity
268+
/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
269+
/// immediate mask into a shuffle mask.
270+
void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
271+
SmallVectorImpl<int> &ShuffleMask) {
272+
unsigned NumLanes = VT.getSizeInBits() / 128;
273+
unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
274+
unsigned ControlBitsMask = NumLanes - 1;
275+
unsigned NumControlBits = NumLanes / 2;
276+
277+
for (unsigned l = 0; l != NumLanes; ++l) {
278+
unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
279+
// We actually need the other source.
280+
if (l >= NumLanes / 2)
281+
LaneMask += NumLanes;
282+
for (unsigned i = 0; i != NumElementsInLane; ++i)
283+
ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
284+
}
285+
}
286+
267287
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
268288
SmallVectorImpl<int> &ShuffleMask) {
269289
unsigned HalfSize = VT.getVectorNumElements() / 2;

‎llvm/lib/Target/X86/Utils/X86ShuffleDecode.h

+5
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
8686
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
8787
SmallVectorImpl<int> &ShuffleMask);
8888

89+
/// \brief Decode a shuffle packed values at 128-bit granularity
90+
/// immediate mask into a shuffle mask.
91+
void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
92+
SmallVectorImpl<int> &ShuffleMask);
93+
8994
/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
9095
/// No VT provided since it only works on 256-bit, 4 element vectors.
9196
void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);

‎llvm/lib/Target/X86/X86ISelLowering.cpp

+44
Original file line numberDiff line numberDiff line change
@@ -10747,6 +10747,42 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
1074710747
}
1074810748
}
1074910749

10750+
/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
10751+
static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
10752+
ArrayRef<int> Mask,
10753+
SDValue V1, SDValue V2,
10754+
SelectionDAG &DAG) {
10755+
assert(VT.getScalarSizeInBits() == 64 &&
10756+
"Unexpected element type size for 128bit shuffle.");
10757+
10758+
// To handle 256 bit vector requires VLX and most probably
10759+
// function lowerV2X128VectorShuffle() is better solution.
10760+
assert(VT.getSizeInBits() == 512 &&
10761+
"Unexpected vector size for 128bit shuffle.");
10762+
10763+
SmallVector<int, 4> WidenedMask;
10764+
if (!canWidenShuffleElements(Mask, WidenedMask))
10765+
return SDValue();
10766+
10767+
// Form a 128-bit permutation.
10768+
// Convert the 64-bit shuffle mask selection values into 128-bit selection
10769+
// bits defined by a vshuf64x2 instruction's immediate control byte.
10770+
unsigned PermMask = 0, Imm = 0;
10771+
unsigned ControlBitsNum = WidenedMask.size() / 2;
10772+
10773+
for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
10774+
if (WidenedMask[i] == SM_SentinelZero)
10775+
return SDValue();
10776+
10777+
// Use first element in place of undef mask.
10778+
Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
10779+
PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
10780+
}
10781+
10782+
return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
10783+
DAG.getConstant(PermMask, DL, MVT::i8));
10784+
}
10785+
1075010786
static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
1075110787
ArrayRef<int> Mask, SDValue V1,
1075210788
SDValue V2, SelectionDAG &DAG) {
@@ -10774,6 +10810,10 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
1077410810
ArrayRef<int> Mask = SVOp->getMask();
1077510811
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
1077610812

10813+
if (SDValue Shuf128 =
10814+
lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
10815+
return Shuf128;
10816+
1077710817
if (SDValue Unpck =
1077810818
lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
1077910819
return Unpck;
@@ -10810,6 +10850,10 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
1081010850
ArrayRef<int> Mask = SVOp->getMask();
1081110851
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
1081210852

10853+
if (SDValue Shuf128 =
10854+
lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
10855+
return Shuf128;
10856+
1081310857
if (SDValue Unpck =
1081410858
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
1081510859
return Unpck;

‎llvm/test/CodeGen/X86/avx512-intrinsics.ll

+9
Original file line numberDiff line numberDiff line change
@@ -4162,7 +4162,9 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x
41624162
; CHECK: ## BB#0:
41634163
; CHECK-NEXT: kmovw %edi, %k1
41644164
; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1}
4165+
; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
41654166
; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm0
4167+
; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
41664168
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
41674169
; CHECK-NEXT: retq
41684170
%res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
@@ -4179,8 +4181,11 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x d
41794181
; CHECK-NEXT: movzbl %dil, %eax
41804182
; CHECK-NEXT: kmovw %eax, %k1
41814183
; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1}
4184+
; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
41824185
; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 {%k1} {z}
4186+
; CHECK-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
41834187
; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0
4188+
; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
41844189
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
41854190
; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
41864191
; CHECK-NEXT: retq
@@ -4200,7 +4205,9 @@ define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32>
42004205
; CHECK: ## BB#0:
42014206
; CHECK-NEXT: kmovw %edi, %k1
42024207
; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1}
4208+
; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
42034209
; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm0
4210+
; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
42044211
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
42054212
; CHECK-NEXT: retq
42064213
%res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
@@ -4217,7 +4224,9 @@ define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x
42174224
; CHECK-NEXT: movzbl %dil, %eax
42184225
; CHECK-NEXT: kmovw %eax, %k1
42194226
; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1}
4227+
; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
42204228
; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0
4229+
; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
42214230
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
42224231
; CHECK-NEXT: retq
42234232
%res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)

0 commit comments

Comments
 (0)
Please sign in to comment.