Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -259,9 +259,9 @@
   return true;
 }
 
-/// If this is a bitcast to narrow elements from a shuffle of wider elements,
-/// try to bitcast the source vector to the narrow type followed by shuffle.
-/// This can enable further transforms by moving bitcasts or shuffles together.
+/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
+/// destination type followed by shuffle. This can enable further transforms by
+/// moving bitcasts or shuffles together.
 static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) {
   Value *V;
   ArrayRef<int> Mask;
@@ -269,32 +269,39 @@
                                               m_Mask(Mask))))))
     return false;
 
+  // Disallow non-vector casts and length-changing shuffles.
+  // TODO: We could allow any shuffle.
   auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
   auto *SrcTy = cast<FixedVectorType>(V->getType());
   if (!DestTy || I.getOperand(0)->getType() != SrcTy)
     return false;
 
-  // TODO: Handle bitcast from narrow element type to wide element type.
-  unsigned DestNumElts = DestTy->getNumElements();
-  unsigned SrcNumElts = SrcTy->getNumElements();
-  if (SrcNumElts > DestNumElts)
-    return false;
-
   // The new shuffle must not cost more than the old shuffle. The bitcast is
   // moved ahead of the shuffle, so assume that it has the same cost as before.
   if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
     return false;
 
-  // Bitcast the source vector and expand the shuffle mask to the equivalent for
-  // narrow elements.
+  unsigned DestNumElts = DestTy->getNumElements();
+  unsigned SrcNumElts = SrcTy->getNumElements();
+  SmallVector<int, 16> NewMask;
+  if (SrcNumElts <= DestNumElts) {
+    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
+    // always be expanded to the equivalent form choosing narrower elements.
+    assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
+    unsigned ScaleFactor = DestNumElts / SrcNumElts;
+    narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
+  } else {
+    // The bitcast is from narrow elements to wide elements. The shuffle mask
+    // must choose consecutive elements to allow casting first.
+    assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
+    unsigned ScaleFactor = SrcNumElts / DestNumElts;
+    if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
+      return false;
+  }
   // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
   IRBuilder<> Builder(&I);
   Value *CastV = Builder.CreateBitCast(V, DestTy);
-  SmallVector<int, 16> NewMask;
-  assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
-  unsigned ScaleFactor = DestNumElts / SrcNumElts;
-  narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
   Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy),
                                             NewMask);
   I.replaceAllUsesWith(Shuf);
Index: llvm/test/Transforms/PhaseOrdering/X86/shuffle.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/X86/shuffle.ll
+++ llvm/test/Transforms/PhaseOrdering/X86/shuffle.ll
@@ -47,17 +47,13 @@
   ret <2 x i64> %bc5
 }
 
-; TODO: Eliminate redundant shuffles
+; Eliminate redundant shuffles
 
 define <2 x i64> @shuffle_8_add_32_shuffle_8_masks_are_eq(<2 x i64> %v) {
 ; CHECK-LABEL: @shuffle_8_add_32_shuffle_8_masks_are_eq(
-; CHECK-NEXT:    [[BC0:%.*]] = bitcast <2 x i64> [[V:%.*]] to <16 x i8>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[BC0]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[SHUFFLE]] to <4 x i32>
-; CHECK-NEXT:    [[ADD_I:%.*]] = shl <4 x i32> [[BC2]],
-; CHECK-NEXT:    [[BC4:%.*]] = bitcast <4 x i32> [[ADD_I]] to <16 x i8>
-; CHECK-NEXT:    [[SHUFFLE4:%.*]] = shufflevector <16 x i8> [[BC4]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    [[BC5:%.*]] = bitcast <16 x i8> [[SHUFFLE4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[TMP1]],
+; CHECK-NEXT:    [[BC5:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[BC5]]
 ;
   %bc0 = bitcast <2 x i64> %v to <16 x i8>
@@ -126,15 +122,14 @@
   ret <16 x i8> %add
 }
 
-; TODO: Sink single shuffle.
+; Sink single shuffle.
 
 define <4 x i32> @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(<8 x i16> %v1, <8 x i16> %v2) {
 ; CHECK-LABEL: @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[V1:%.*]], <8 x i16> undef, <8 x i32>
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[V2:%.*]], <8 x i16> undef, <8 x i32>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <8 x i16> [[SHUFFLE1]] to <4 x i32>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[SHUFFLE2]] to <4 x i32>
-; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[BC2]], [[BC1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[V2:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[ADD]]
 ;
   %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32>
@@ -145,15 +140,14 @@
   ret <4 x i32> %add
 }
 
-; TODO: Sink single shuffle.
+; Sink single shuffle.
 
 define <4 x i32> @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up(<16 x i8> %v1, <16 x i8> %v2) {
 ; CHECK-LABEL: @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up(
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <16 x i8> [[V2:%.*]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <4 x i32>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[SHUFFLE2]] to <4 x i32>
-; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[BC2]], [[BC1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[V2:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[ADD]]
 ;
   %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
@@ -229,13 +223,13 @@
 }
 
 ; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<16 x i8>(v)))
-; TODO: Narrow, squash shuffles, and widen type?
+; TODO: squash shuffles?
 
 define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(<16 x i8> %v1) {
 ; CHECK-LABEL: @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE2]]
 ;
   %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
@@ -245,13 +239,13 @@
 }
 
 ; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<8 x i16>(v)))
-; TODO: Narrow, squash shuffles, and widen type?
+; TODO: squash shuffles?
 
 define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up(<8 x i16> %v1) {
 ; CHECK-LABEL: @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up(
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[V1:%.*]], <8 x i16> undef, <8 x i32>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <8 x i16> [[SHUFFLE1]] to <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE2]]
 ;
   %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32>
@@ -293,13 +287,13 @@
 }
 
 ; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<16 x i8>(v)))
-; TODO: Narrow, squash shuffles, and widen type?
+; TODO: squash shuffles and widen type?
 
 define <8 x i16> @shuffle_8_bitcast_16_shuffle_16_can__be_converted_up(<16 x i8> %v1) {
 ; CHECK-LABEL: @shuffle_8_bitcast_16_shuffle_16_can__be_converted_up(
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <8 x i16>
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[BC1]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32>
 ; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE2]]
 ;
   %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
Index: llvm/test/Transforms/VectorCombine/X86/shuffle.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/shuffle.ll
+++ llvm/test/Transforms/VectorCombine/X86/shuffle.ll
@@ -59,13 +59,13 @@
   ret i128 %r
 }
 
-; Negative test - but might want to try this
+; Widen shuffle elements
 
 define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
 ; CHECK-LABEL: @bitcast_shuf_wide_element(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[SHUF]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %shuf = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32>
   %r = bitcast <8 x i16> %shuf to <4 x i32>
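
Editorial note, not part of the patch: a minimal before/after sketch of the new narrow-to-wide path, in the spirit of the updated @bitcast_shuf_wide_element test above. The function names @widen_sketch_before/@widen_sketch_after and the mask values are illustrative assumptions, not taken from the patch. The i16 mask below selects aligned, consecutive pairs of narrow lanes, so widenShuffleMaskElts can rescale it by ScaleFactor = SrcNumElts / DestNumElts = 8 / 4 = 2 and the bitcast can be hoisted ahead of the shuffle:

; Before: shuffle on narrow i16 elements, then bitcast to wide i32 elements.
define <4 x i32> @widen_sketch_before(<8 x i16> %v) {
  %shuf = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %r = bitcast <8 x i16> %shuf to <4 x i32>
  ret <4 x i32> %r
}

; After: each i16 pair (4,5) (6,7) (0,1) (2,3) maps to one whole i32 lane, so
; the mask widens to <2, 3, 0, 1> and the shuffle runs on <4 x i32> elements.
define <4 x i32> @widen_sketch_after(<8 x i16> %v) {
  %bc = bitcast <8 x i16> %v to <4 x i32>
  %r = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x i32> %r
}

A mask whose groups are not consecutive and aligned, for example the rotate mask <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>, cannot be widened; widenShuffleMaskElts returns false and the fold is skipped, which is why the narrow-to-wide branch has the early return.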