Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1082,6 +1082,7 @@ if (!VTy) break; unsigned InVWidth = VTy->getNumElements(); APInt InputDemandedElts(InVWidth, 0); + UndefElts2 = APInt(InVWidth, 0); unsigned Ratio; if (VWidth == InVWidth) { @@ -1089,29 +1090,25 @@ // elements as are demanded of us. Ratio = 1; InputDemandedElts = DemandedElts; - } else if (VWidth > InVWidth) { - // Untested so far. - break; - - // If there are more elements in the result than there are in the source, - // then an input element is live if any of the corresponding output - // elements are live. - Ratio = VWidth/InVWidth; - for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an input element is live if any of the + // corresponding output elements are live. + Ratio = VWidth / InVWidth; + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) if (DemandedElts[OutIdx]) - InputDemandedElts.setBit(OutIdx/Ratio); - } - } else { - // Untested so far. - break; - - // If there are more elements in the source than there are in the result, - // then an input element is live if the corresponding output element is - // live. - Ratio = InVWidth/VWidth; + InputDemandedElts.setBit(OutIdx / Ratio); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an input element is live if the + // corresponding output element is live. 
+ Ratio = InVWidth / VWidth; for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (DemandedElts[InIdx/Ratio]) + if (DemandedElts[InIdx / Ratio]) InputDemandedElts.setBit(InIdx); + } else { + // Unsupported so far. + break; } // div/rem demand all inputs, because they don't want divide by zero. @@ -1122,24 +1119,26 @@ MadeChange = true; } - UndefElts = UndefElts2; - if (VWidth > InVWidth) { - llvm_unreachable("Unimp"); - // If there are more elements in the result than there are in the source, - // then an output element is undef if the corresponding input element is - // undef. + if (VWidth == InVWidth) { + UndefElts = UndefElts2; + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an output element is undef if the + // corresponding input element is undef. for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) - if (UndefElts2[OutIdx/Ratio]) + if (UndefElts2[OutIdx / Ratio]) + UndefElts.setBit(OutIdx); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an output element is undef if all of the + // corresponding input elements are undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); + if (SubUndef.countPopulation() == Ratio) UndefElts.setBit(OutIdx); - } else if (VWidth < InVWidth) { + } + } else { llvm_unreachable("Unimp"); - // If there are more elements in the source than there are in the result, - // then a result element is undef if all of the corresponding input - // elements are undef. - UndefElts = ~0ULL >> (64-VWidth); // Start out all undef. - for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (!UndefElts2[InIdx]) // Not undef? - UndefElts.clearBit(InIdx/Ratio); // Clear undef bit. 
 } break; } Index: llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll +++ llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll @@ -838,6 +838,17 @@ ret <8 x i16> %2 } +define <8 x i16> @sse2_psra_w_var_bc(<8 x i16> %v, <2 x i64> %a) { +; CHECK-LABEL: @sse2_psra_w_var_bc +; CHECK-NEXT: %1 = bitcast <2 x i64> %a to <8 x i16> +; CHECK-NEXT: %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1) +; CHECK-NEXT: ret <8 x i16> %2 + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = bitcast <2 x i64> %1 to <8 x i16> + %3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %2) + ret <8 x i16> %3 +} + define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @sse2_psra_d_var ; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a) @@ -847,6 +858,17 @@ ret <4 x i32> %2 } +define <4 x i32> @sse2_psra_d_var_bc(<4 x i32> %v, <8 x i16> %a) { +; CHECK-LABEL: @sse2_psra_d_var_bc +; CHECK-NEXT: %1 = bitcast <8 x i16> %a to <4 x i32> +; CHECK-NEXT: %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1) +; CHECK-NEXT: ret <4 x i32> %2 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 0, i32 0, i32 0> + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %2) + ret <4 x i32> %3 +} + define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx2_psra_w_var ; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a) @@ -901,6 +923,17 @@ ret <16 x i16> %2 } +define <16 x i16> @avx2_psrl_w_var_bc(<16 x i16> %v, <16 x i8> %a) { +; CHECK-LABEL: @avx2_psrl_w_var_bc +; CHECK-NEXT: %1 = bitcast <16 x i8> %a to <8 x i16> +; CHECK-NEXT: %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1) +; 
CHECK-NEXT: ret <16 x i16> %2 + %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %2) + ret <16 x i16> %3 +} + define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psrl_d_var ; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a) @@ -910,6 +943,17 @@ ret <8 x i32> %2 } +define <8 x i32> @avx2_psrl_d_var_bc(<8 x i32> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx2_psrl_d_var_bc +; CHECK-NEXT: %1 = bitcast <2 x i64> %a to <4 x i32> +; CHECK-NEXT: %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1) +; CHECK-NEXT: ret <8 x i32> %2 + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %2) + ret <8 x i32> %3 +} + define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx2_psrl_q_var ; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)