Commit 19792fb (parent c989506), committed Mar 10, 2015

[X86, AVX] replace vinsertf128 intrinsics with generic shuffles

We want to replace as much custom x86 shuffling via intrinsics as possible
because pushing the code down the generic shuffle optimization path allows
for better codegen and less complexity in LLVM.

This is the sibling patch for the Clang half of this change:
http://reviews.llvm.org/D8088

Differential Revision: http://reviews.llvm.org/D8086

llvm-svn: 231794
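To make the change concrete, here is a minimal sketch in LLVM IR of what the upgrade produces for the ps variant with immediate 1. This is illustrative only (the function names are invented); the shuffle masks match the AutoUpgrade logic added below.

; Old form: the target-specific intrinsic, now removed and auto-upgraded.
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)

define <8 x float> @insert_high_old(<8 x float> %a, <4 x float> %b) {
  %r = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a, <4 x float> %b, i8 1)
  ret <8 x float> %r
}

; New form: generic shuffles. First widen %b to 256 bits (mask elements 4-7
; read from the undef operand), then keep the low half of %a and take the low
; half of the widened %b as the high half.
define <8 x float> @insert_high_new(<8 x float> %a, <4 x float> %b) {
  %wide = shufflevector <4 x float> %b, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %r
}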

7 files changed (+98, -124 lines)

llvm/include/llvm/IR/IntrinsicsX86.td (-13)

@@ -1183,19 +1183,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx_vextractf128_si_256 :
         GCCBuiltin<"__builtin_ia32_vextractf128_si256">,
         Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx_vinsertf128_pd_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_pd256">,
-        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                   llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vinsertf128_ps_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_ps256">,
-        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                   llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vinsertf128_si_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_si256">,
-        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                   llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
 }

 // Vector convert

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (-3)

@@ -4956,9 +4956,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, Res);
     return nullptr;
   }
-  case Intrinsic::x86_avx_vinsertf128_pd_256:
-  case Intrinsic::x86_avx_vinsertf128_ps_256:
-  case Intrinsic::x86_avx_vinsertf128_si_256:
   case Intrinsic::x86_avx2_vinserti128: {
     EVT DestVT = TLI.getValueType(I.getType());
     EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());

llvm/lib/IR/AutoUpgrade.cpp (+51, -1)

@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the auto-upgrade helper functions
+// This file implements the auto-upgrade helper functions.
+// This is where deprecated IR intrinsics and other IR features are updated to
+// current specifications.
 //
 //===----------------------------------------------------------------------===//
 
@@ -156,6 +158,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name.startswith("x86.avx2.pcmpeq.") ||
         Name.startswith("x86.avx2.pcmpgt.") ||
         Name.startswith("x86.avx.vpermil.") ||
+        Name == "x86.avx.vinsertf128.pd.256" ||
+        Name == "x86.avx.vinsertf128.ps.256" ||
+        Name == "x86.avx.vinsertf128.si.256" ||
         Name == "x86.avx.movnt.dq.256" ||
         Name == "x86.avx.movnt.pd.256" ||
         Name == "x86.avx.movnt.ps.256" ||
@@ -626,6 +631,51 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     }
 
     Rep = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
+  } else if (Name == "llvm.x86.avx.vinsertf128.pd.256" ||
+             Name == "llvm.x86.avx.vinsertf128.ps.256" ||
+             Name == "llvm.x86.avx.vinsertf128.si.256") {
+    Value *Op0 = CI->getArgOperand(0);
+    Value *Op1 = CI->getArgOperand(1);
+    unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+    VectorType *VecTy = cast<VectorType>(CI->getType());
+    unsigned NumElts = VecTy->getNumElements();
+
+    // Mask off the high bits of the immediate value; hardware ignores those.
+    Imm = Imm & 1;
+
+    // Extend the second operand into a vector that is twice as big.
+    Value *UndefV = UndefValue::get(Op1->getType());
+    SmallVector<Constant*, 8> Idxs;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      Idxs.push_back(Builder.getInt32(i));
+    }
+    Rep = Builder.CreateShuffleVector(Op1, UndefV, ConstantVector::get(Idxs));
+
+    // Insert the second operand into the first operand.
+
+    // Note that there is no guarantee that instruction lowering will actually
+    // produce a vinsertf128 instruction for the created shuffles. In
+    // particular, the 0 immediate case involves no lane changes, so it can
+    // be handled as a blend.
+
+    // Example of shuffle mask for 32-bit elements:
+    // Imm = 1  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+    // Imm = 0  <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7 >
+
+    SmallVector<Constant*, 8> Idxs2;
+    // The low half of the result is either the low half of the 1st operand
+    // or the low half of the 2nd operand (the inserted vector).
+    for (unsigned i = 0; i != NumElts / 2; ++i) {
+      unsigned Idx = Imm ? i : (i + NumElts);
+      Idxs2.push_back(Builder.getInt32(Idx));
+    }
+    // The high half of the result is either the low half of the 2nd operand
+    // (the inserted vector) or the high half of the 1st operand.
+    for (unsigned i = NumElts / 2; i != NumElts; ++i) {
+      unsigned Idx = Imm ? (i + NumElts / 2) : i;
+      Idxs2.push_back(Builder.getInt32(Idx));
+    }
+    Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
   } else {
     bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
     if (Name == "llvm.x86.avx.vpermil.pd.256")
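As a quick sanity check of the mask construction above (a hand-worked example, not part of the diff; the function name is invented): for the si variant, NumElts = 8 and an immediate of 2 leaves Imm = 0 after masking, so the second shuffle mask is <8, 9, 10, 11, 4, 5, 6, 7>. The inserted vector supplies the low half and the high half of the first operand is untouched, so lowering can treat it as a blend rather than a vinsertf128. In IR:

define <8 x i32> @upgraded_si_imm2(<8 x i32> %a0, <4 x i32> %a1) {
  ; Widen %a1; mask elements 4-7 read from the undef operand.
  %wide = shufflevector <4 x i32> %a1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Imm = 2 & 1 = 0: low half from the widened %a1, high half from %a0.
  %res = shufflevector <8 x i32> %a0, <8 x i32> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %res
}

This is the case exercised by test_x86_avx_vinsertf128_si_256_2 in the upgrade test below.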

llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll (+36)

@@ -1,5 +1,41 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
 
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+
+define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
+  ret <8 x i32> %res
+}
+
+; Verify that high bits of the immediate are masked off. This should be the equivalent
+; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
+; not a vinsertf128 $1.
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
+
 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK: vblendpd
   %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]

llvm/test/CodeGen/X86/avx-intrinsics-x86.ll (-24)

@@ -2187,30 +2187,6 @@ define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
 
 
-define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-
-
 define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vperm2f128
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]

llvm/test/CodeGen/X86/avx-vinsertf128.ll (+11, -31)

@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
 
+; CHECK-LABEL: A:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@ entry:
   ret <8 x float> %shuffle
 }
 
+; CHECK-LABEL: B:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
 
 ; Just check that no crash happens
-; CHECK-SSE: _insert_crash
+; CHECK-LABEL: _insert_crash:
 define void @insert_crash() nounwind {
 allocas:
   %v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -39,15 +40,15 @@ allocas:
 
 ;; DAG Combine must remove useless vinsertf128 instructions
 
-; CHECK: DAGCombineA
+; CHECK-LABEL: DAGCombineA:
 ; CHECK-NOT: vinsertf128 $1
 define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %2
 }
 
-; CHECK: DAGCombineB
+; CHECK-LABEL: DAGCombineB:
 ; CHECK: vpaddd %xmm
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
   ret <8 x i32> %2
 }
 
-; CHECK: insert_pd
-define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vinsertf128
-  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
-  ret <4 x double> %res
-}
-
-; CHECK: insert_undef_pd
+; CHECK-LABEL: insert_undef_pd:
 define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
   %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@ ret <4 x double> %res
 declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
 
 
-; CHECK: insert_ps
-define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vinsertf128
-  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
-  ret <8 x float> %res
-}
-
-; CHECK: insert_undef_ps
+; CHECK-LABEL: insert_undef_ps:
 define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
   %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@ ret <8 x float> %res
 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
 
 
-; CHECK: insert_si
-define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vinsertf128
-  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
-  ret <8 x i32> %res
-}
-
-; CHECK: insert_undef_si
+; CHECK-LABEL: insert_undef_si:
 define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
   %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@ ret <8 x i32> %res
 declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
 
 ; rdar://10643481
-; CHECK: vinsertf128_combine
+; CHECK-LABEL: vinsertf128_combine:
 define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovaps
 ; CHECK: vinsertf128
@@ -118,7 +98,7 @@ entry:
 }
 
 ; rdar://11076953
-; CHECK: vinsertf128_ucombine
+; CHECK-LABEL: vinsertf128_ucombine:
 define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovups
 ; CHECK: vinsertf128

llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll (-52)

@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
 ; if it's faster.
 
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
-  ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads_swap
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
-  ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
 ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
 