diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h --- a/llvm/include/llvm/IR/Value.h +++ b/llvm/include/llvm/IR/Value.h @@ -444,6 +444,10 @@ /// This is logically equivalent to getNumUses() >= N. bool hasNUsesOrMore(unsigned N) const; + /// Return true if this value is the only user of \p V, i.e. \p V has at + /// least one use and every use of \p V is by this value. + bool isOnlyUserOf(const Value *V) const; + /// Return true if there is exactly one user of this value that cannot be /// dropped. /// diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -141,6 +141,17 @@ return hasNItemsOrMore(use_begin(), use_end(), N); } +bool Value::isOnlyUserOf(const Value *V) const { + bool Seen = false; + for (auto *U : V->users()) { + if (U == this) + Seen = true; + else + return false; + } + return Seen; +} + static bool isUnDroppableUser(const User *U) { return !U->isDroppable(); } Use *Value::getSingleUndroppableUse() { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1300,11 +1300,12 @@ if (!I) return nullptr; // Only analyze instructions. bool MadeChange = false; - auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum, - APInt Demanded, APInt &Undef) { + auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum, APInt Demanded, + APInt &Undef, bool AllowMultipleUsers = false) { auto *II = dyn_cast<IntrinsicInst>(Inst); Value *Op = II ? 
II->getArgOperand(OpNum) : Inst->getOperand(OpNum); - if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) { + if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1, + AllowMultipleUsers)) { if (II) II->setArgOperand(OpNum, V); else @@ -1690,7 +1691,12 @@ case Intrinsic::x86_sse2_min_sd: case Intrinsic::x86_sse2_max_sd: case Intrinsic::x86_sse2_cmp_sd: { - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); + // If the args are duplicated and are only used by this instruction then + // arg0 is the most demanded elts we need. + bool OnlyUser = II->getArgOperand(0) == II->getArgOperand(1) && + II->isOnlyUserOf(II->getArgOperand(0)); + + simplifyAndSetOp(II, 0, DemandedElts, UndefElts, OnlyUser); // If lowest element of a scalar op isn't used then use Arg0. if (!DemandedElts[0]) { @@ -1698,9 +1704,11 @@ return II->getArgOperand(0); } - // Only lower element is used for operand 1. - DemandedElts = 1; - simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); + if (!OnlyUser) { + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); + } // Lower element is undefined if both lower elements are undefined. // Consider things like undef&0. The result is known zero, not undef. 
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse.ll --- a/llvm/test/Transforms/InstCombine/X86/x86-sse.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-sse.ll @@ -308,7 +308,7 @@ define float @test_min_ss_3(float %a) { ; CHECK-LABEL: @test_min_ss_3( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP1]], <4 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 ; CHECK-NEXT: ret float [[TMP3]] @@ -368,7 +368,7 @@ define float @test_max_ss_4(float %a) { ; CHECK-LABEL: @test_max_ss_4( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP1]], <4 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 ; CHECK-NEXT: ret float [[TMP3]] @@ -428,7 +428,7 @@ define float @test_cmp_ss_2(float %a) { ; CHECK-LABEL: @test_cmp_ss_2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP1]], <4 x float> [[TMP1]], i8 3) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 ; CHECK-NEXT: ret float [[TMP3]] diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll --- a/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll @@ -219,7 +219,7 @@ define double @test_min_sd_2(double %a) { ; CHECK-LABEL: @test_min_sd_2( -; CHECK-NEXT: 
[[TMP1:%.*]] = insertelement <2 x double> , double [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> [[TMP1]], <2 x double> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: ret double [[TMP3]] @@ -272,7 +272,7 @@ define double @test_max_sd_2(double %a) { ; CHECK-LABEL: @test_max_sd_2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> , double [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> [[TMP1]], <2 x double> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: ret double [[TMP3]] @@ -325,7 +325,7 @@ define double @test_cmp_sd_2(double %a) { ; CHECK-LABEL: @test_cmp_sd_2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> , double [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP1]], <2 x double> [[TMP1]], i8 3) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: ret double [[TMP3]]