
Commit d7bae45

Author: Igor Breger
Committed: Oct 15, 2015
AVX512: Implemented DAG lowering for shuff64x2/shufi64x2 instructions (shuffle packed values at 128-bit granularity)
Differential Revision: http://reviews.llvm.org/D13648

llvm-svn: 250400
Parent: 59e569b

File tree

8 files changed: +406 -121 lines
 

llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp (+64 -1)
@@ -107,6 +107,51 @@ static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
   }
 }
 
+#define CASE_VSHUF_COMMON(Inst, Suffix, src2) \
+  case X86::VSHUFF##Inst##Suffix##r##src2##i: \
+  case X86::VSHUFF##Inst##Suffix##r##src2##ik: \
+  case X86::VSHUFF##Inst##Suffix##r##src2##ikz: \
+  case X86::VSHUFI##Inst##Suffix##r##src2##i: \
+  case X86::VSHUFI##Inst##Suffix##r##src2##ik: \
+  case X86::VSHUFI##Inst##Suffix##r##src2##ikz:
+
+#define CASE_VSHUF(Inst) \
+  CASE_VSHUF_COMMON(Inst, Z, r) \
+  CASE_VSHUF_COMMON(Inst, Z, m) \
+  CASE_VSHUF_COMMON(Inst, Z256, r) \
+  CASE_VSHUF_COMMON(Inst, Z256, m)
+
+/// \brief Extracts the type and whether there is a memory operand for a
+/// given (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction.
+static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) {
+  HasMemOp = false;
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown VSHUF64x2 family instruction.");
+    break;
+  CASE_VSHUF_COMMON(64X2, Z, m)
+    HasMemOp = true; // FALL THROUGH.
+  CASE_VSHUF_COMMON(64X2, Z, r)
+    VT = MVT::v8i64;
+    break;
+  CASE_VSHUF_COMMON(64X2, Z256, m)
+    HasMemOp = true; // FALL THROUGH.
+  CASE_VSHUF_COMMON(64X2, Z256, r)
+    VT = MVT::v4i64;
+    break;
+  CASE_VSHUF_COMMON(32X4, Z, m)
+    HasMemOp = true; // FALL THROUGH.
+  CASE_VSHUF_COMMON(32X4, Z, r)
+    VT = MVT::v16i32;
+    break;
+  CASE_VSHUF_COMMON(32X4, Z256, m)
+    HasMemOp = true; // FALL THROUGH.
+  CASE_VSHUF_COMMON(32X4, Z256, r)
+    VT = MVT::v8i32;
+    break;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Top Level Entrypoint
 //===----------------------------------------------------------------------===//
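For reference, the CASE_VSHUF/CASE_VSHUF_COMMON macros above fan out, via token pasting, to 24 opcode case labels per instruction width: float and int variants, plain, merge-masked, and zero-masked forms, register and memory operands, 512-bit and 256-bit. The compilable toy below mirrors the trick outside of LLVM; the enumerator names are illustrative stand-ins spelled the way the pasting would produce them, not the real X86:: opcode enum.

#include <cstdio>

// Stand-in enumerators; the real patch pastes into names like
// X86::VSHUFF64X2Zrri inside the X86 opcode enum.
enum Opcode {
  VSHUFF64X2Zrri, VSHUFF64X2Zrrik, VSHUFF64X2Zrrikz,
  VSHUFI64X2Zrri, VSHUFI64X2Zrrik, VSHUFI64X2Zrrikz,
  SomethingElse
};

#define CASE_VSHUF_COMMON(Inst, Suffix, src2) \
  case VSHUFF##Inst##Suffix##r##src2##i:      \
  case VSHUFF##Inst##Suffix##r##src2##ik:     \
  case VSHUFF##Inst##Suffix##r##src2##ikz:    \
  case VSHUFI##Inst##Suffix##r##src2##i:      \
  case VSHUFI##Inst##Suffix##r##src2##ik:     \
  case VSHUFI##Inst##Suffix##r##src2##ikz:

static const char *classify(Opcode Op) {
  switch (Op) {
  CASE_VSHUF_COMMON(64X2, Z, r)
    return "512-bit 64x2 shuffle, register form (any masking)";
  default:
    return "something else";
  }
}

int main() {
  // The merge-masked variant hits the same case label as the plain one.
  std::printf("%s\n", classify(VSHUFF64X2Zrrik));
}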
@@ -726,7 +771,25 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     Src1Name = getRegName(MI->getOperand(1).getReg());
     DestName = getRegName(MI->getOperand(0).getReg());
     break;
-
+  CASE_VSHUF(64X2)
+  CASE_VSHUF(32X4) {
+    MVT VT;
+    bool HasMemOp;
+    unsigned NumOp = MI->getNumOperands();
+    getVSHUF64x2FamilyInfo(MI, VT, HasMemOp);
+    decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(),
+                              ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (HasMemOp) {
+      assert((NumOp >= 8) && "Expected at least 8 operands!");
+      Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg());
+    } else {
+      assert((NumOp >= 4) && "Expected at least 4 operands!");
+      Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg());
+      Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg());
+    }
+    break;
+  }
   case X86::UNPCKLPDrr:
   case X86::VUNPCKLPDrr:
     Src2Name = getRegName(MI->getOperand(2).getReg());

llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp (+20)
@@ -264,6 +264,26 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
+/// \brief Decode the immediate mask of a shuffle of packed values at
+/// 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
+/// into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+                               SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
+  unsigned ControlBitsMask = NumLanes - 1;
+  unsigned NumControlBits = NumLanes / 2;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+    // We actually need the other source.
+    if (l >= NumLanes / 2)
+      LaneMask += NumLanes;
+    for (unsigned i = 0; i != NumElementsInLane; ++i)
+      ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+  }
+}
+
 void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
                           SmallVectorImpl<int> &ShuffleMask) {
   unsigned HalfSize = VT.getVectorNumElements() / 2;
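To make the decode concrete: a 512-bit vector has four 128-bit lanes, each destination lane takes two control bits from the immediate, and the upper two destination lanes read from the second source. Below is a standalone C++ sketch of the loop above, run on the immediate 22 that the intrinsics tests in this commit use; decodeMask is a hypothetical helper that swaps the MVT/SmallVectorImpl machinery for plain types, not LLVM API.

#include <cstdio>
#include <vector>

// Standalone mirror of decodeVSHUF64x2FamilyMask for a 512-bit vector,
// parameterized by element width.
static std::vector<int> decodeMask(unsigned EltBits, unsigned Imm) {
  const unsigned NumLanes = 512 / 128;              // 4 lanes
  const unsigned NumElementsInLane = 128 / EltBits; // 2 (64-bit) or 4 (32-bit)
  const unsigned ControlBitsMask = NumLanes - 1;    // two bits select a lane
  const unsigned NumControlBits = NumLanes / 2;
  std::vector<int> Mask;
  for (unsigned l = 0; l != NumLanes; ++l) {
    unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
    if (l >= NumLanes / 2) // upper destination lanes read the second source
      LaneMask += NumLanes;
    for (unsigned i = 0; i != NumElementsInLane; ++i)
      Mask.push_back(LaneMask * NumElementsInLane + i);
  }
  return Mask;
}

int main() {
  // Imm = 22 = 0b00'01'01'10: lanes 2,1 of the first source, lanes 1,0 of
  // the second (element indices past the vector length denote source two).
  for (unsigned EltBits : {64u, 32u}) {
    std::printf("%u-bit elements:", EltBits);
    for (int E : decodeMask(EltBits, 22))
      std::printf(" %d", E);
    std::printf("\n");
  }
  // 64-bit elements: 4 5 2 3 10 11 8 9
  // 32-bit elements: 8 9 10 11 4 5 6 7 20 21 22 23 16 17 18 19
}

With two sources named zmm0 and zmm1, those masks read back as zmm0[4,5,2,3],zmm1[2,3,0,1] and zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3], exactly the assembly comments the new avx512-intrinsics.ll checks below expect.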

llvm/lib/Target/X86/Utils/X86ShuffleDecode.h (+5)
@@ -86,6 +86,11 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
                           SmallVectorImpl<int> &ShuffleMask);
 
+/// \brief Decode the immediate mask of a shuffle of packed values at
+/// 128-bit granularity into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+                               SmallVectorImpl<int> &ShuffleMask);
+
 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);

llvm/lib/Target/X86/X86ISelLowering.cpp (+44)
@@ -10747,6 +10747,42 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 }
 
+/// \brief Try to lower a vector shuffle as 128-bit shuffles.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
+                                        ArrayRef<int> Mask,
+                                        SDValue V1, SDValue V2,
+                                        SelectionDAG &DAG) {
+  assert(VT.getScalarSizeInBits() == 64 &&
+         "Unexpected element type size for 128bit shuffle.");
+
+  // Handling a 256-bit vector requires VLX, and the function
+  // lowerV2X128VectorShuffle() is most probably a better solution there.
+  assert(VT.getSizeInBits() == 512 &&
+         "Unexpected vector size for 128bit shuffle.");
+
+  SmallVector<int, 4> WidenedMask;
+  if (!canWidenShuffleElements(Mask, WidenedMask))
+    return SDValue();
+
+  // Form a 128-bit permutation.
+  // Convert the 64-bit shuffle mask selection values into 128-bit selection
+  // bits defined by a vshuf64x2 instruction's immediate control byte.
+  unsigned PermMask = 0, Imm = 0;
+  unsigned ControlBitsNum = WidenedMask.size() / 2;
+
+  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+    if (WidenedMask[i] == SM_SentinelZero)
+      return SDValue();
+
+    // Use the first element in place of an undef mask.
+    Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+    PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
+  }
+
+  return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
 static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
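For a concrete immediate: the shuffle mask <0,1,4,5,10,11,8,9> used by the new vector-shuffle-512-v8.ll tests widens to the 128-bit lane mask <0,2,5,4>, which the loop above packs into a control byte two bits per destination lane. A minimal standalone sketch of that packing; encodeImm is a hypothetical stand-in for the loop and assumes no SM_SentinelZero/SM_SentinelUndef lanes, which the real code has to handle.

#include <cstdio>

// Standalone mirror of the PermMask construction above for a 512-bit
// shuffle widened to four 128-bit lanes; not LLVM API.
static unsigned encodeImm(const int (&WidenedMask)[4]) {
  const unsigned Size = 4;
  const unsigned ControlBitsNum = Size / 2; // two bits per destination lane
  unsigned PermMask = 0;
  for (unsigned i = 0; i != Size; ++i)
    PermMask |= (WidenedMask[i] % Size) << (i * ControlBitsNum);
  return PermMask;
}

int main() {
  // <0,1,4,5,10,11,8,9> at 64-bit granularity widens to lanes <0,2,5,4>;
  // the modulo folds second-source lanes 5,4 back to control values 1,0.
  const int WidenedMask[4] = {0, 2, 5, 4};
  std::printf("imm = %u\n", encodeImm(WidenedMask)); // imm = 24 (0x18)
}

Feeding 24 back through decodeVSHUF64x2FamilyMask reproduces zmm0[0,1,4,5],zmm1[2,3,0,1], the pattern the vshuff64x2/vshufi64x2 tests below expect.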
@@ -10774,6 +10810,10 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+    return Shuf128;
+
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
     return Unpck;
@@ -10810,6 +10850,10 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+    return Shuf128;
+
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;

llvm/test/CodeGen/X86/avx512-intrinsics.ll (+9)
@@ -4162,7 +4162,9 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
 ; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
 ; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
@@ -4179,8 +4181,11 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x d
 ; CHECK-NEXT: movzbl %dil, %eax
 ; CHECK-NEXT: kmovw %eax, %k1
 ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT: retq
@@ -4200,7 +4205,9 @@ define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32>
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
 ; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
 ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
@@ -4217,7 +4224,9 @@ define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x
 ; CHECK-NEXT: movzbl %dil, %eax
 ; CHECK-NEXT: kmovw %eax, %k1
 ; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)

llvm/test/CodeGen/X86/avx512vl-intrinsics.ll (+125 -107)

(Large diff not rendered.)

llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll (+137 -9)
@@ -139,14 +139,12 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_01014545:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_01014545:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,4,0,5,0,4,0,5,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
 ret <8 x double> %shuffle
@@ -1122,18 +1120,16 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
 }
 
 define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
-;
 ; AVX512F-LABEL: shuffle_v8i64_01014545:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_01014545:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,4,0,5,0,4,0,5,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512F-32-NEXT: retl
+
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
 ret <8 x i64> %shuffle
 }
@@ -2010,3 +2006,135 @@ define <8 x i64> @shuffle_v8i64_193b5d7f(<8 x i64> %a, <8 x i64> %b) {
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
 ret <8 x i64> %shuffle
 }
+
+define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+%res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512_maskz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512_maskz:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
+; AVX512F-32-NEXT: vpandq .LCPI118_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+%res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+ret <8 x double> %res
+}
+
+define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
+; AVX512F-LABEL: test_vshufi64x2_512_mask:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshufi64x2_512_mask:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
+; AVX512F-32-NEXT: vpandq .LCPI119_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+%y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+%res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
+ret <8 x i64> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512_mem:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512_mem:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: retl
+%x1 = load <8 x double>,<8 x double> *%ptr,align 1
+%res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512_mem_mask:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-32-NEXT: vpandq .LCPI121_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: retl
+%x1 = load <8 x double>,<8 x double> *%ptr,align 1
+%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+%res = select <8 x i1> %mask, <8 x double> %y, <8 x double> %x
+ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512_mem_maskz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-32-NEXT: vpandq .LCPI122_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: retl
+%x1 = load <8 x double>,<8 x double> *%ptr,align 1
+%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+%res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+ret <8 x double> %res
+}
+
+define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) nounwind {
+; AVX512F-LABEL: test_vshuff32x4_512:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff32x4_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+%res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+ret <16 x float> %res
+}

llvm/test/CodeGen/X86/vector-shuffle-v1.ll (+2 -4)
@@ -213,8 +213,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
 ; AVX512F-NEXT: movzbl %dil, %eax
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
@@ -224,8 +223,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
 ; VL_BW_DQ: # BB#0:
 ; VL_BW_DQ-NEXT: kmovb %edi, %k0
 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u>
-; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
 ; VL_BW_DQ-NEXT: retq
