Skip to content

Commit 1a80595

Browse files
committed Apr 24, 2014
Optimize some special cases for SSE4a insertqi
Summary: Since the upper 64 bits of the destination register are undefined when performing this operation, we can substitute the input and let the optimizer figure out that only a copy is needed. Also added range merging, for when an instruction copies a range that can be merged with a previously copied range. Added test cases for both optimizations.
Reviewers: grosbach, nadav
CC: llvm-commits
Differential Revision: http://reviews.llvm.org/D3357
llvm-svn: 207055
1 parent 6072817 commit 1a80595

File tree

2 files changed

+164
-0
lines changed

2 files changed

+164
-0
lines changed
 

Diff for: ‎llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

+67
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,73 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
578578
break;
579579
}
580580

581+
case Intrinsic::x86_sse4a_insertqi: {
  // Fold SSE4a INSERTQI when its width/start immediates are constants:
  //  (1) insertqi x, y, 64, 0 copies all 64 low bits of y, so the result's
  //      low half is just y's low half and the high half is undef anyway.
  //  (2) two chained insertqi from the same source vector whose bit ranges
  //      touch or overlap can be merged into a single insertqi.
  // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
  // ones undef
  // TODO: eventually we should lower this intrinsic to IR
  if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
    if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
      if (CIWidth->equalsInt(64) && CIStart->isZero()) {
        // Case (1): replace with a shuffle of y and undef. Mask index 0
        // selects y's element 0; index 2 selects element 0 of the second
        // (undef) operand, making the high i64 explicitly undef.
        Value *Vec = II->getArgOperand(1);
        Value *Undef = UndefValue::get(Vec->getType());
        const uint32_t Mask[] = { 0, 2 };
        return ReplaceInstUsesWith(
            CI,
            Builder->CreateShuffleVector(
                Vec, Undef, ConstantDataVector::get(
                                II->getContext(), ArrayRef<uint32_t>(Mask))));

      } else if (auto Source =
                     dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
        // Case (2): operand 0 is itself an intrinsic call; try to merge.
        // NOTE(review): there is no check here that Source is actually an
        // insertqi — presumably getArgOperand(2)/(3) only form a valid
        // range for insertqi sources; confirm Source's intrinsic ID is
        // checked elsewhere or add an explicit check.
        if (Source->hasOneUse() &&
            Source->getArgOperand(1) == II->getArgOperand(1)) {
          // If the source of the insert has only one use and it's another
          // insert (and they're both inserting from the same vector), try to
          // bundle both together.
          auto CISourceWidth =
              dyn_cast<ConstantInt>(Source->getArgOperand(2));
          auto CISourceStart =
              dyn_cast<ConstantInt>(Source->getArgOperand(3));
          if (CISourceStart && CISourceWidth) {
            // Compute both half-open bit ranges [Start, End) and
            // [SourceStart, SourceEnd) from the i8 immediates.
            unsigned Start = CIStart->getZExtValue();
            unsigned Width = CIWidth->getZExtValue();
            unsigned End = Start + Width;
            unsigned SourceStart = CISourceStart->getZExtValue();
            unsigned SourceWidth = CISourceWidth->getZExtValue();
            unsigned SourceEnd = SourceStart + SourceWidth;
            unsigned NewStart, NewWidth;
            bool ShouldReplace = false;
            // Merge when the ranges touch or overlap (either ordering);
            // disjoint ranges fall through with ShouldReplace == false.
            // NOTE(review): the immediates are not range-checked, so
            // out-of-range inputs (start+width > 64) would merge into
            // another out-of-range insertqi — presumably acceptable since
            // the input was already undefined; TODO confirm.
            if (Start <= SourceStart && SourceStart <= End) {
              NewStart = Start;
              NewWidth = std::max(End, SourceEnd) - NewStart;
              ShouldReplace = true;
            } else if (SourceStart <= Start && Start <= SourceEnd) {
              NewStart = SourceStart;
              NewWidth = std::max(SourceEnd, End) - NewStart;
              ShouldReplace = true;
            }

            if (ShouldReplace) {
              // Rebuild a single insertqi covering the merged range,
              // reading from the outer insert's original destination.
              Constant *ConstantWidth = ConstantInt::get(
                  II->getArgOperand(2)->getType(), NewWidth, false);
              Constant *ConstantStart = ConstantInt::get(
                  II->getArgOperand(3)->getType(), NewStart, false);
              Value *Args[4] = { Source->getArgOperand(0),
                                 II->getArgOperand(1), ConstantWidth,
                                 ConstantStart };
              Module *M = CI.getParent()->getParent()->getParent();
              Value *F =
                  Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
              return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
            }
          }
        }
      }
    }
  }
  break;
}
647+
581648
case Intrinsic::x86_avx_vpermilvar_ps:
582649
case Intrinsic::x86_avx_vpermilvar_ps_256:
583650
case Intrinsic::x86_avx_vpermilvar_pd:

Diff for: ‎llvm/test/Transforms/InstCombine/vec_demanded_elts.ll

+97
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,103 @@ define <4 x float> @test_select(float %f, float %g) {
209209
ret <4 x float> %ret
210210
}
211211

212+
; We should optimize these two redundant insertqi into one
; Both inserts write the same 32-bit range (width 32 at bit 32) from %i,
; so the second completely shadows the first.
; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
; CHECK-NOT: insertqi
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  ret <2 x i64> %2
}
221+
222+
; The result of this insert is the second arg, since the top 64 bits of
; the result are undefined, and we copy the bottom 64 bits from the
; second arg
; (width 64 at start 0 copies all of %i's low element; InstCombine folds
; the whole call away and returns %i directly.)
; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  ret <2 x i64> %1
}
231+
232+
; Test the several types of ranges and ordering that exist for two insertqi
; Here the second insert's range [16,32) is contained in the first's [0,32),
; so the pair merges into a single 32-bit insert at bit 0.
; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  ret <2 x i64> %2
}
241+
242+
; Same contained ranges as above, but with the inner 16-bit insert first;
; the result must still merge to a single 32-bit insert at bit 0.
; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}
250+
251+
; Overlapping ranges [0,32) and [16,48) merge into one 48-bit insert at bit 0.
; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  ret <2 x i64> %2
}
259+
260+
; Same overlapping ranges in the opposite order; still one 48-bit insert.
; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}
268+
269+
; Adjacent (touching, non-overlapping) ranges [0,32) and [32,48) merge
; into one 48-bit insert at bit 0.
; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}
277+
278+
; Same adjacent ranges in the opposite order; still one 48-bit insert.
; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}
286+
287+
; Disjoint ranges [0,16) and [32,48) must NOT merge; both calls remain.
; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}
295+
296+
; Disjoint ranges in the opposite order ([32,48) first, then [0,16));
; they must NOT merge and both calls must remain.
; NOTE: this test was a byte-for-byte duplicate of @testInsertDisjointRange;
; every other "_2" variant reverses the order of the two inserts, so the
; operands here are swapped to actually exercise the reversed ordering.
; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 0)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 0)
  ret <2 x i64> %2
}
304+
305+
306+
; The declaration itself must survive the combines above.
; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
308+
212309
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
213310
define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
214311
; CHECK-LABEL: @test_vpermilvar_ps(

0 commit comments

Comments
 (0)
Please sign in to comment.