Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -576,6 +576,72 @@
     break;
   }
 
+  case Intrinsic::x86_sse4a_insertqi: {
+    // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
+    // ones undef
+    if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+      if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
+        if (CIWidth->equalsInt(64) && CIStart->isZero()) {
+          Value *Vec = II->getArgOperand(1);
+          Value *Undef = UndefValue::get(Vec->getType());
+          const uint32_t Mask[] = { 0, 2 };
+          return ReplaceInstUsesWith(
+              CI,
+              Builder->CreateShuffleVector(
+                  Vec, Undef, ConstantDataVector::get(
+                                  II->getContext(), ArrayRef<uint32_t>(Mask))));
+
+        } else if (auto Source =
+                       dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+          if (Source->hasOneUse() &&
+              Source->getArgOperand(1) == II->getArgOperand(1)) {
+            // If the source of the insert has only one use and it's another
+            // insert (and they're both inserting from the same vector), try to
+            // bundle both together.
+            auto CISourceWidth =
+                dyn_cast<ConstantInt>(Source->getArgOperand(2));
+            auto CISourceStart =
+                dyn_cast<ConstantInt>(Source->getArgOperand(3));
+            if (CISourceStart && CISourceWidth) {
+              unsigned Start = CIStart->getZExtValue();
+              unsigned Width = CIWidth->getZExtValue();
+              unsigned End = Start + Width;
+              unsigned SourceStart = CISourceStart->getZExtValue();
+              unsigned SourceWidth = CISourceWidth->getZExtValue();
+              unsigned SourceEnd = SourceStart + SourceWidth;
+              unsigned NewStart, NewWidth;
+              bool ShouldReplace = false;
+              if (Start <= SourceStart && SourceStart <= End) {
+                NewStart = Start;
+                NewWidth = std::max(End, SourceEnd) - NewStart;
+                ShouldReplace = true;
+              } else if (SourceStart <= Start && Start <= SourceEnd) {
+                NewStart = SourceStart;
+                NewWidth = std::max(SourceEnd, End) - NewStart;
+                ShouldReplace = true;
+              }
+
+              if (ShouldReplace) {
+                Constant *ConstantWidth = ConstantInt::get(
+                    II->getArgOperand(2)->getType(), NewWidth, false);
+                Constant *ConstantStart = ConstantInt::get(
+                    II->getArgOperand(3)->getType(), NewStart, false);
+                Value *Args[4] = { Source->getArgOperand(0),
+                                   II->getArgOperand(1), ConstantWidth,
+                                   ConstantStart };
+                Module *M = CI.getParent()->getParent()->getParent();
+                Value *F =
+                    Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
+                return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
+              }
+            }
+          }
+        }
+      }
+    }
+    break;
+  }
+
   case Intrinsic::ppc_altivec_vperm:
     // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
     if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
Index: test/Transforms/InstCombine/vec_demanded_elts.ll
===================================================================
--- test/Transforms/InstCombine/vec_demanded_elts.ll
+++ test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -209,4 +209,100 @@
   ret <4 x float> %ret
 }
 
+; We should optimize these two redundant insertqi into one
+; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+; CHECK-NOT: insertqi
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
+  ret <2 x i64> %2
+}
+
+; The result of this insert is the second arg, since the top 64 bits of
+; the result are undefined, and we copy the bottom 64 bits from the
+; second arg
+; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> %i
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+  ret <2 x i64> %1
+}
+
+; Test the several types of ranges and ordering that exist for two insertqi
+; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
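
For context on the first fold, here is a small standalone model of the low-qword behaviour it relies on. The helper name insertqiLowQword and the constants are mine (not part of the patch), and it assumes the usual reading of SSE4a INSERTQ: the low `width` bits of the second operand are inserted at bit `start` of the first operand's low 64 bits, with the upper 64 bits of the result undefined. Under that reading, width 64 / start 0 yields exactly the second operand's low qword, which is why the fold can forward %i via the {0, 2} shuffle and why testInsert64Bits expects `ret <2 x i64> %i`.

// insertqi_low_model.cpp -- toy model of the defined (low) half of an
// insertqi result; an illustration, not code from the patch.
#include <cassert>
#include <cstdint>

// Insert the low `width` bits of `src` at bit `start` of `dst`.
// The real instruction's upper 64 result bits are undefined, so only this
// low qword is modelled.
static uint64_t insertqiLowQword(uint64_t dst, uint64_t src, unsigned width,
                                 unsigned start) {
  uint64_t fieldMask = (width >= 64) ? ~0ULL : ((1ULL << width) - 1ULL);
  return (dst & ~(fieldMask << start)) | ((src & fieldMask) << start);
}

int main() {
  const uint64_t v = 0x1122334455667788ULL;
  const uint64_t i = 0xAABBCCDDEEFF0011ULL;

  // Width 64, start 0 copies the second operand's whole low qword, so the
  // only defined part of the result is exactly %i (testInsert64Bits).
  assert(insertqiLowQword(v, i, 64, 0) == i);
  return 0;
}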
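
And here is a sketch of the start/width bookkeeping used when bundling two insertqi calls. The function name mergeRanges, the outer/inner naming, and the {0, 0} "leave it alone" sentinel are mine, but the arithmetic mirrors the NewStart/NewWidth computation in the patch (outer = the insertqi being visited, inner = its first operand), and the asserts mirror the FileCheck expectations in the new tests; disjoint ranges are not combined.

// merge_insertqi_ranges.cpp -- standalone sketch of the range bookkeeping.
#include <algorithm>
#include <cassert>
#include <utility>

// Returns {NewStart, NewWidth} for the combined insertqi, or {0, 0} when the
// two ranges are disjoint and the pair of calls is left untouched.
static std::pair<unsigned, unsigned>
mergeRanges(unsigned OuterStart, unsigned OuterWidth,
            unsigned InnerStart, unsigned InnerWidth) {
  unsigned OuterEnd = OuterStart + OuterWidth;
  unsigned InnerEnd = InnerStart + InnerWidth;
  if (OuterStart <= InnerStart && InnerStart <= OuterEnd)
    return {OuterStart, std::max(OuterEnd, InnerEnd) - OuterStart};
  if (InnerStart <= OuterStart && OuterStart <= InnerEnd)
    return {InnerStart, std::max(InnerEnd, OuterEnd) - InnerStart};
  return {0, 0}; // disjoint: keep both insertqi calls
}

int main() {
  // Arguments are (OuterStart, OuterWidth, InnerStart, InnerWidth), taken
  // from the second and first insertqi of each test, respectively.
  assert(mergeRanges(32, 32, 32, 32) == std::make_pair(32u, 32u)); // testInsertTwice -> i8 32, i8 32
  assert(mergeRanges(16, 16, 0, 32) == std::make_pair(0u, 32u));   // testInsertContainedRange -> i8 32, i8 0
  assert(mergeRanges(16, 32, 0, 32) == std::make_pair(0u, 48u));   // testInsertOverlappingRange -> i8 48, i8 0
  assert(mergeRanges(32, 16, 0, 32) == std::make_pair(0u, 48u));   // testInsertAdjacentRange -> i8 48, i8 0
  assert(mergeRanges(32, 16, 0, 16) == std::make_pair(0u, 0u));    // testInsertDisjointRange -> unchanged
  return 0;
}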