Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2929,16 +2929,10 @@
   case Intrinsic::x86_avx_blendv_ps_256:
   case Intrinsic::x86_avx_blendv_pd_256:
   case Intrinsic::x86_avx2_pblendvb: {
-    // Convert blendv* to vector selects if the mask is constant.
-    // This optimization is convoluted because the intrinsic is defined as
-    // getting a vector of floats or doubles for the ps and pd versions.
-    // FIXME: That should be changed.
-
+    // fold (blend A, A, Mask) -> A
     Value *Op0 = II->getArgOperand(0);
     Value *Op1 = II->getArgOperand(1);
     Value *Mask = II->getArgOperand(2);
-
-    // fold (blend A, A, Mask) -> A
     if (Op0 == Op1)
       return replaceInstUsesWith(CI, Op0);
 
@@ -2951,6 +2945,20 @@
       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
     }
+
+    // Convert to a vector select if we can bypass casts and find a boolean
+    // vector condition value.
+    Value *BoolVec;
+    if (match(peekThroughBitcast(Mask), m_SExt(m_Value(BoolVec)))) {
+      auto *VTy = dyn_cast<VectorType>(BoolVec->getType());
+      if (VTy && VTy->getScalarSizeInBits() == 1 &&
+          VTy->getVectorNumElements() == II->getType()->getVectorNumElements())
+        return SelectInst::Create(BoolVec, Op1, Op0);
+      // TODO: If we can find a boolean vector condition with fewer elements,
+      //       then we can form a vector select by bitcasting Op0/Op1 to a
+      //       vector type with wider elements and bitcasting the result.
+    }
+
     break;
   }
Index: llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll
+++ llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll
@@ -157,9 +157,7 @@
 
 define <4 x float> @sel_v4f32(<4 x float> %x, <4 x float> %y, <4 x i1> %cond) {
 ; CHECK-LABEL: @sel_v4f32(
-; CHECK-NEXT:    [[S:%.*]] = sext <4 x i1> [[COND:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[S]] to <4 x float>
-; CHECK-NEXT:    [[R:%.*]] = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[B]])
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[X:%.*]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = sext <4 x i1> %cond to <4 x i32>
@@ -170,9 +168,7 @@
 
 define <2 x double> @sel_v2f64(<2 x double> %x, <2 x double> %y, <2 x i1> %cond) {
 ; CHECK-LABEL: @sel_v2f64(
-; CHECK-NEXT:    [[S:%.*]] = sext <2 x i1> [[COND:%.*]] to <2 x i64>
-; CHECK-NEXT:    [[B:%.*]] = bitcast <2 x i64> [[S]] to <2 x double>
-; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> [[X:%.*]], <2 x double> [[Y:%.*]], <2 x double> [[B]])
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[COND:%.*]], <2 x double> [[Y:%.*]], <2 x double> [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x double> [[R]]
 ;
   %s = sext <2 x i1> %cond to <2 x i64>
@@ -198,8 +194,7 @@
 
 define <16 x i8> @sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i1> %cond) {
 ; CHECK-LABEL: @sel_v16i8(
-; CHECK-NEXT:    [[S:%.*]] = sext <16 x i1> [[COND:%.*]] to <16 x i8>
-; CHECK-NEXT:    [[R:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[X:%.*]], <16 x i8> [[Y:%.*]], <16 x i8> [[S]])
+; CHECK-NEXT:    [[R:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i8> [[Y:%.*]], <16 x i8> [[X:%.*]]
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
 ;
   %s = sext <16 x i1> %cond to <16 x i8>
@@ -217,9 +212,7 @@
 ; CHECK-LABEL: @sel_v4f32_sse_reality(
 ; CHECK-NEXT:    [[LD:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <4 x float> [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT:    [[COND:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-; CHECK-NEXT:    [[R:%.*]] = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> [[LD]], <4 x float> zeroinitializer, <4 x float> [[COND]])
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[CMP]], <4 x float> zeroinitializer, <4 x float> [[LD]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %ld = load <4 x float>, <4 x float>* %x, align 16
@@ -234,9 +227,7 @@
 ; CHECK-LABEL: @sel_v2f64_sse_reality(
 ; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* [[X:%.*]], align 16
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <2 x double> [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-; CHECK-NEXT:    [[COND:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
-; CHECK-NEXT:    [[R:%.*]] = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> [[LD]], <2 x double> zeroinitializer, <2 x double> [[COND]])
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[CMP]], <2 x double> zeroinitializer, <2 x double> [[LD]]
 ; CHECK-NEXT:    ret <2 x double> [[R]]
 ;
   %ld = load <2 x double>, <2 x double>* %x, align 16
@@ -247,6 +238,8 @@
   ret <2 x double> %r
 }
 
+; TODO: We can bitcast the inputs to the select and the result and remove the intrinsic.
+
 define <2 x i64> @sel_v4i32_sse_reality(<2 x i64>* nocapture readonly %x, <2 x i64> %y, <2 x i64> %z) {
 ; CHECK-LABEL: @sel_v4i32_sse_reality(
 ; CHECK-NEXT:    [[XCAST:%.*]] = bitcast <2 x i64>* [[X:%.*]] to <16 x i8>*
@@ -279,8 +272,7 @@
 ; CHECK-NEXT:    [[YCAST:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <16 x i8>
 ; CHECK-NEXT:    [[ZCAST:%.*]] = bitcast <2 x i64> [[Z:%.*]] to <16 x i8>
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <16 x i8> [[YCAST]], [[ZCAST]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <16 x i1> [[CMP]] to <16 x i8>
-; CHECK-NEXT:    [[R:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[LD]], <16 x i8> zeroinitializer, <16 x i8> [[SEXT]])
+; CHECK-NEXT:    [[R:%.*]] = select <16 x i1> [[CMP]], <16 x i8> zeroinitializer, <16 x i8> [[LD]]
 ; CHECK-NEXT:    [[RCAST:%.*]] = bitcast <16 x i8> [[R]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[RCAST]]
 ;
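
For reference, a minimal standalone sketch in LLVM IR of the new fold, mirroring the @sel_v4f32 test above; the function name @blendv_example is hypothetical and not part of the patch:

declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)

; Before InstCombine: the blend mask is a sign-extended <4 x i1> condition
; bitcast to <4 x float> to satisfy the intrinsic's signature.
define <4 x float> @blendv_example(<4 x float> %x, <4 x float> %y, <4 x i1> %cond) {
  %s = sext <4 x i1> %cond to <4 x i32>
  %b = bitcast <4 x i32> %s to <4 x float>
  %r = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %x, <4 x float> %y, <4 x float> %b)
  ret <4 x float> %r
}

; After InstCombine: peekThroughBitcast() skips the bitcast, m_SExt() recovers
; %cond, and the element counts match, so the intrinsic becomes a vector select
; (blendv picks the second operand where the mask's sign bit is set).
define <4 x float> @blendv_example.after(<4 x float> %x, <4 x float> %y, <4 x i1> %cond) {
  %r = select <4 x i1> %cond, <4 x float> %y, <4 x float> %x
  ret <4 x float> %r
}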