
Commit dcbfbb1

Committed Jun 11, 2017
[x86] use vperm2f128 rather than vinsertf128 when there's a chance to fold a 32-byte load
I was looking closer at the x86 test diffs in D33866, and the first change seems like it shouldn't happen in the first place, so this patch resolves that.

According to Agner's tables and AMD docs, vperm2f128 and vinsertf128 have identical timing for any given CPU model, so we should be able to interchange them without affecting perf. But as some of the diffs here show, using vperm2f128 allows load folding, so we should take that opportunity to reduce code size and register pressure.

A secondary advantage is making AVX1 and AVX2 codegen more similar. Given that vperm2f128 was introduced with AVX1, we should be selecting it in all of the same situations that we would with AVX2. If there's some reason that an AVX1 CPU would not want to use this instruction, that should be fixed up in a later pass.

Differential Revision: https://reviews.llvm.org/D33938

llvm-svn: 305171
Parent: 7ed6cd3
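For context, here is a minimal IR sketch of the pattern this targets, modeled on the shuffle_v8f32_01230123_mem test updated below (the function name and llc flags are illustrative, not part of the patch): a shuffle that repeats the low 128-bit lane of a loaded 256-bit vector. Before this change, AVX1 emitted vmovaps + vinsertf128 for it; with the change, both AVX1 and AVX2 fold the 32-byte load directly into vperm2f128, as the updated CHECK lines show.

; Hypothetical reduced test; try: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < dup_low_lane.ll
define <8 x float> @dup_low_lane(<8 x float>* %p) {
  %v = load <8 x float>, <8 x float>* %p
  ; Mask <0,1,2,3,0,1,2,3>: duplicate the low 128 bits into both halves.
  %s = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %s
}
; With this patch, AVX1 and AVX2 both produce a single vperm2f128 with a folded
; memory operand (see the updated shuffle_v8f32_01230123_mem checks below).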

File tree

3 files changed: +36, -50 lines
 

llvm/lib/Target/X86/X86ISelLowering.cpp (+13, -9)

@@ -12007,18 +12007,22 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
     // subvector.
     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
-      // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
+      // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
       if (Subtarget.hasAVX2() && V2.isUndef())
         return SDValue();
 
-      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
-                                   VT.getVectorNumElements() / 2);
-      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
-                                DAG.getIntPtrConstant(0, DL));
-      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
-                                OnlyUsesV1 ? V1 : V2,
-                                DAG.getIntPtrConstant(0, DL));
-      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
+      // this will likely become vinsertf128 which can't fold a 256-bit memop.
+      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
+        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                     VT.getVectorNumElements() / 2);
+        SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                                  DAG.getIntPtrConstant(0, DL));
+        SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                                  OnlyUsesV1 ? V1 : V2,
+                                  DAG.getIntPtrConstant(0, DL));
+        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+      }
     }
   }
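The isa<LoadSDNode> check above skips the EXTRACT_SUBVECTOR/CONCAT_VECTORS (i.e. vinsertf128) lowering when the first shuffle operand (looking through bitcasts) is a load, so such shuffles fall through to the vperm2f128 path: vinsertf128 can fold at most a 128-bit memory operand, while vperm2f128 can fold the whole 32-byte load (the AVX2 tests below show the analogous vperm2i128). A hypothetical reduced IR example of the {0, 1, 4, 5} concat-of-low-halves case that this check reroutes on AVX1 (the function name is illustrative, not from the patch):

; Hypothetical example; with -mattr=+avx and the loaded value used only by this
; shuffle, the load should fold into a single vperm2f128 rather than being
; emitted as a separate vmovaps plus vinsertf128.
define <4 x double> @concat_low_halves(<4 x double>* %pa, <4 x double> %b) {
  %a = load <4 x double>, <4 x double>* %pa
  ; Result = low 128 bits of %a followed by low 128 bits of %b.
  %s = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}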

llvm/test/CodeGen/X86/avx-vperm2x128.ll (+6, -14)

@@ -50,16 +50,10 @@ entry:
 }
 
 define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; AVX1-LABEL: shuffle_v8f32_01230123_mem:
-; AVX1:       ## BB#0: ## %entry
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_01230123_mem:
-; AVX2:       ## BB#0: ## %entry
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_01230123_mem:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; ALL-NEXT:    retq
 entry:
   %a = load <8 x float>, <8 x float>* %pa
   %b = load <8 x float>, <8 x float>* %pb

@@ -195,17 +189,15 @@ define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounw
 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
 ; AVX1:       ## BB#0: ## %entry
 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovaps (%rsi), %ymm1
 ; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v16i16_4501_mem:
 ; AVX2:       ## BB#0: ## %entry
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
 ; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
 ; AVX2-NEXT:    retq
 entry:
   %c = load <16 x i16>, <16 x i16>* %a

llvm/test/CodeGen/X86/x86-interleaved-access.ll (+17, -27)

@@ -9,8 +9,8 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 ; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd 64(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
 ; AVX-NEXT:    vhaddpd %ymm5, %ymm4, %ymm4
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]

@@ -37,8 +37,8 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
 ; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd 64(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]

@@ -53,25 +53,15 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
 }
 
 define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
-; AVX1-LABEL: load_factorf64_1:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
-; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX1-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_factorf64_1:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovupd (%rdi), %ymm0
-; AVX2-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX2-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
-; AVX2-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
-; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load_factorf64_1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovupd (%rdi), %ymm0
+; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>

@@ -86,8 +76,8 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 ; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
 ; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
 ; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]

@@ -113,8 +103,8 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 ; AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm5
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
