Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1140,7 +1140,8 @@
   return true;
 }
 
-static Instruction *foldSelectShuffles(ShuffleVectorInst &Shuf) {
+static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
+                                      InstCombiner::BuilderTy &Builder) {
   // Folds under here require the equivalent of a vector select.
   if (!Shuf.isSelect())
     return nullptr;
@@ -1150,16 +1151,14 @@
       !match(Shuf.getOperand(1), m_BinOp(B1)))
     return nullptr;
 
-  // TODO: Fold the case with different variable operands (requires creating a
-  // new shuffle and checking number of uses).
-  Value *X;
+  Value *X, *Y;
   Constant *C0, *C1;
   bool ConstantsAreOp1;
   if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
-      match(B1, m_BinOp(m_Specific(X), m_Constant(C1))))
+      match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
     ConstantsAreOp1 = true;
   else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
-           match(B1, m_BinOp(m_Constant(C1), m_Specific(X))))
+           match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
     ConstantsAreOp1 = false;
   else
     return nullptr;
@@ -1191,9 +1190,36 @@
   // The opcodes must be the same. Use a new name to make that clear.
   BinaryOperator::BinaryOps BOpc = Opc0;
 
-  // Remove a binop and the shuffle by rearranging the constant:
-  // shuffle (op X, C0), (op X, C1), M --> op X, C'
-  // shuffle (op C0, X), (op C1, X), M --> op C', X
+  Value *V;
+  if (X == Y) {
+    // Remove a binop and the shuffle by rearranging the constant:
+    // shuffle (op V, C0), (op V, C1), M --> op V, C'
+    // shuffle (op C0, V), (op C1, V), M --> op C', V
+    V = X;
+  } else if (!Instruction::isIntDivRem(BOpc) &&
+             (B0->hasOneUse() || B1->hasOneUse())) {
+    // If there are 2 different variable operands, we must create a new shuffle
+    // (select) first, so check uses to ensure that we don't end up with more
+    // instructions than we started with.
+    //
+    // Note: In general, we do not create new shuffles in InstCombine because we
+    // do not know if a target can lower an arbitrary shuffle optimally. In this
+    // case, the shuffle uses the existing mask, so there is no additional risk.
+    //
+    // TODO: We are disallowing div/rem because a shuffle with an undef mask
+    // element would propagate an undef value to the div/rem. That's not
+    // safe in general because div/rem allow for undefined behavior. We can
+    // loosen this restriction (eg, check if the mask has no undefs or replace
+    // undef elements).
+
+    // Select the variable vectors first, then perform the binop:
+    // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
+    // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
+    V = Builder.CreateShuffleVector(X, Y, Shuf.getMask());
+  } else {
+    return nullptr;
+  }
+
   Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Shuf.getMask());
 
   // If the shuffle mask contains undef elements, then the new constant
@@ -1202,8 +1228,8 @@
   if (Instruction::isIntDivRem(BOpc))
     NewC = getSafeVectorConstantForIntDivRem(NewC);
 
-  Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, X, NewC) :
-                                         BinaryOperator::Create(BOpc, NewC, X);
+  Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
+                                         BinaryOperator::Create(BOpc, NewC, V);
 
   // Flags are intersected from the 2 source binops.
   NewBO->copyIRFlags(B0);
@@ -1223,7 +1249,7 @@
           LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI)))
     return replaceInstUsesWith(SVI, V);
 
-  if (Instruction *I = foldSelectShuffles(SVI))
+  if (Instruction *I = foldSelectShuffle(SVI, Builder))
     return I;
 
   bool MadeChange = false;
Index: llvm/trunk/test/Transforms/InstCombine/shuffle_select.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/shuffle_select.ll
+++ llvm/trunk/test/Transforms/InstCombine/shuffle_select.ll
@@ -241,9 +241,8 @@
 
 define <4 x i32> @add_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @add_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = add <4 x i32> [[V0:%.*]],
-; CHECK-NEXT:    [[T2:%.*]] = add <4 x i32> [[V1:%.*]],
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = add <4 x i32> [[TMP1]],
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
   %t1 = add <4 x i32> %v0,
@@ -256,9 +255,8 @@
 
 define <4 x i32> @sub_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @sub_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = sub <4 x i32> , [[V0:%.*]]
-; CHECK-NEXT:    [[T2:%.*]] = sub <4 x i32> , [[V1:%.*]]
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = sub <4 x i32> , [[TMP1]]
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
   %t1 = sub <4 x i32> , %v0
@@ -272,9 +270,8 @@
 
 define <4 x i32> @mul_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @mul_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = mul <4 x i32> [[V0:%.*]],
-; CHECK-NEXT:    [[T2:%.*]] = mul <4 x i32> [[V1:%.*]],
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = mul <4 x i32> [[TMP1]],
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
   %t1 = mul <4 x i32> %v0,
@@ -287,9 +284,8 @@
 
 define <4 x i32> @shl_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @shl_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = shl nsw <4 x i32> [[V0:%.*]],
-; CHECK-NEXT:    [[T2:%.*]] = shl nsw <4 x i32> [[V1:%.*]],
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = shl nsw <4 x i32> [[TMP1]],
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
   %t1 = shl nsw <4 x i32> %v0,
@@ -302,9 +298,8 @@
 
 define <4 x i32> @lshr_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @lshr_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = lshr <4 x i32> , [[V0:%.*]]
-; CHECK-NEXT:    [[T2:%.*]] = lshr exact <4 x i32> , [[V1:%.*]]
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = lshr <4 x i32> , [[TMP1]]
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
   %t1 = lshr <4 x i32> , %v0
@@ -317,9 +312,8 @@
 
 define <3 x i32> @ashr_2_vars(<3 x i32> %v0, <3 x i32> %v1) {
 ; CHECK-LABEL: @ashr_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = ashr <3 x i32> [[V0:%.*]],
-; CHECK-NEXT:    [[T2:%.*]] = ashr <3 x i32> [[V1:%.*]],
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <3 x i32> [[T1]], <3 x i32> [[T2]], <3 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[V0:%.*]], <3 x i32> [[V1:%.*]], <3 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = ashr <3 x i32> [[TMP1]],
 ; CHECK-NEXT:    ret <3 x i32> [[T3]]
 ;
   %t1 = ashr <3 x i32> %v0,
@@ -330,9 +324,8 @@
 
 define <3 x i42> @and_2_vars(<3 x i42> %v0, <3 x i42> %v1) {
 ; CHECK-LABEL: @and_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = and <3 x i42> [[V0:%.*]],
-; CHECK-NEXT:    [[T2:%.*]] = and <3 x i42> [[V1:%.*]],
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <3 x i42> [[T1]], <3 x i42> [[T2]], <3 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i42> [[V0:%.*]], <3 x i42> [[V1:%.*]], <3 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = and <3 x i42> [[TMP1]],
 ; CHECK-NEXT:    ret <3 x i42> [[T3]]
 ;
   %t1 = and <3 x i42> %v0,
@@ -346,8 +339,8 @@
 define <4 x i32> @or_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @or_2_vars(
 ; CHECK-NEXT:    [[T1:%.*]] = or <4 x i32> [[V0:%.*]],
-; CHECK-NEXT:    [[T2:%.*]] = or <4 x i32> [[V1:%.*]],
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = or <4 x i32> [[TMP1]],
 ; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[T1]])
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
@@ -392,6 +385,8 @@
   ret <4 x i32> %t3
 }
 
+; TODO: If the shuffle has no undefs, it's safe to shuffle the variables first.
+
 define <4 x i32> @sdiv_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @sdiv_2_vars(
 ; CHECK-NEXT:    [[T1:%.*]] = sdiv <4 x i32> [[V0:%.*]],
@@ -405,6 +400,8 @@
   ret <4 x i32> %t3
 }
 
+; TODO: If the shuffle has no undefs, it's safe to shuffle the variables first.
+
 define <4 x i32> @urem_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @urem_2_vars(
 ; CHECK-NEXT:    [[T1:%.*]] = urem <4 x i32> , [[V0:%.*]]
@@ -435,9 +432,8 @@
 
 define <4 x float> @fadd_2_vars(<4 x float> %v0, <4 x float> %v1) {
 ; CHECK-LABEL: @fadd_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = fadd <4 x float> [[V0:%.*]],
-; CHECK-NEXT:    [[T2:%.*]] = fadd <4 x float> [[V1:%.*]],
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x float> [[T1]], <4 x float> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V0:%.*]], <4 x float> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = fadd <4 x float> [[TMP1]],
 ; CHECK-NEXT:    ret <4 x float> [[T3]]
 ;
   %t1 = fadd <4 x float> %v0,
@@ -448,9 +444,8 @@
 
 define <4 x double> @fsub_2_vars(<4 x double> %v0, <4 x double> %v1) {
 ; CHECK-LABEL: @fsub_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = fsub <4 x double> , [[V0:%.*]]
-; CHECK-NEXT:    [[T2:%.*]] = fsub <4 x double> , [[V1:%.*]]
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x double> [[T1]], <4 x double> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = fsub <4 x double> , [[TMP1]]
 ; CHECK-NEXT:    ret <4 x double> [[T3]]
 ;
   %t1 = fsub <4 x double> , %v0
@@ -463,9 +458,8 @@
 
 define <4 x float> @fmul_2_vars(<4 x float> %v0, <4 x float> %v1) {
 ; CHECK-LABEL: @fmul_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = fmul reassoc nsz <4 x float> [[V0:%.*]],
-; CHECK-NEXT:    [[T2:%.*]] = fmul reassoc nsz <4 x float> [[V1:%.*]],
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x float> [[T1]], <4 x float> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V0:%.*]], <4 x float> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = fmul reassoc nsz <4 x float> [[TMP1]],
 ; CHECK-NEXT:    ret <4 x float> [[T3]]
 ;
   %t1 = fmul reassoc nsz <4 x float> %v0,
@@ -476,9 +470,8 @@
 
 define <4 x double> @frem_2_vars(<4 x double> %v0, <4 x double> %v1) {
 ; CHECK-LABEL: @frem_2_vars(
-; CHECK-NEXT:    [[T1:%.*]] = frem nnan ninf <4 x double> , [[V0:%.*]]
-; CHECK-NEXT:    [[T2:%.*]] = frem nnan arcp <4 x double> , [[V1:%.*]]
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x double> [[T1]], <4 x double> [[T2]], <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <4 x i32>
+; CHECK-NEXT:    [[T3:%.*]] = frem nnan <4 x double> , [[TMP1]]
 ; CHECK-NEXT:    ret <4 x double> [[T3]]
 ;
   %t1 = frem nnan ninf <4 x double> , %v0
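For reference, a minimal IR sketch of the two-variable fold added above. The function name @two_vars_example, the constants, and the select mask are made up for illustration; they are not taken from the patch or its tests:

define <4 x i32> @two_vars_example(<4 x i32> %v0, <4 x i32> %v1) {
  %t1 = add <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
  %t2 = add <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x i32> %t3
}

; The mask is a select: lanes 0 and 3 come from %t1, lanes 1 and 2 from %t2.
; The fold shuffles the variable operands with the same mask and applies one
; binop with the shuffled constant (lane 0 keeps 1 from C0, lanes 1-2 take 6
; and 7 from C1, lane 3 keeps 4 from C0):
;
;   %s  = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
;   %t3 = add <4 x i32> %s, <i32 1, i32 6, i32 7, i32 4>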