Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -244,6 +245,49 @@
   return true;
 }
 
+/// If this is a bitcast to narrow elements from a shuffle of wider elements,
+/// try to bitcast the source vector to the narrow type followed by shuffle.
+/// This can enable further transforms by moving bitcasts or shuffles together.
+static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) {
+  Value *V;
+  ArrayRef<int> Mask;
+  if (!match(&I, m_BitCast(m_OneUse(m_ShuffleVector(m_Value(V), m_Undef(),
+                                                    m_Mask(Mask))))))
+    return false;
+
+  Type *DestTy = I.getType();
+  Type *SrcTy = V->getType();
+  if (!DestTy->isVectorTy() || I.getOperand(0)->getType() != SrcTy)
+    return false;
+
+  // TODO: Handle bitcast from narrow element type to wide element type.
+  assert(SrcTy->isVectorTy() && "Shuffle of non-vector type?");
+  unsigned DestNumElts = DestTy->getVectorNumElements();
+  unsigned SrcNumElts = SrcTy->getVectorNumElements();
+  if (SrcNumElts > DestNumElts)
+    return false;
+
+  // The new shuffle must not cost more than the old shuffle. The bitcast is
+  // moved ahead of the shuffle, so assume that it has the same cost as before.
+  if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
+    return false;
+
+  // Bitcast the source vector and expand the shuffle mask to the equivalent for
+  // narrow elements.
+  // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
+  IRBuilder<> Builder(&I);
+  Value *CastV = Builder.CreateBitCast(V, DestTy);
+  SmallVector<int, 16> NewMask;
+  assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
+  unsigned ScaleFactor = DestNumElts / SrcNumElts;
+  scaleShuffleMask(ScaleFactor, Mask, NewMask);
+  Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy),
+                                            NewMask);
+  I.replaceAllUsesWith(Shuf);
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -265,6 +309,7 @@
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       MadeChange |= foldExtractExtract(I, TTI);
+      MadeChange |= foldBitcastShuf(I, TTI);
     }
   }
 
Index: llvm/test/Transforms/VectorCombine/X86/shuffle.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/shuffle.ll
+++ llvm/test/Transforms/VectorCombine/X86/shuffle.ll
@@ -2,28 +2,39 @@
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
 
+; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb)
+
 define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) {
-; CHECK-LABEL: @bitcast_shuf_narrow_element(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @bitcast_shuf_narrow_element(
+; SSE-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_narrow_element(
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    ret <16 x i8> [[TMP2]]
 ;
   %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %r = bitcast <4 x i32> %shuf to <16 x i8>
   ret <16 x i8> %r
 }
 
+; v4f32 is the same cost as v4i32, so this always works
+
 define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_same_size(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <4 x float>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %r = bitcast <4 x i32> %shuf to <4 x float>
   ret <4 x float> %r
 }
 
+; Negative test - length-changing shuffle
+
 define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_size(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -35,6 +46,8 @@
   ret <16 x i8> %r
 }
 
+; Negative test - must cast to vector type
+
 define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -46,6 +59,8 @@
   ret i128 %r
 }
 
+; Negative test - but might want to try this
+
 define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
 ; CHECK-LABEL: @bitcast_shuf_wide_element(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 0, i32 1>
@@ -59,6 +74,8 @@
 
 declare void @use(<4 x i32>)
 
+; Negative test - don't create an extra shuffle
+
 define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_uses(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -73,15 +90,25 @@
 }
 
 define <2 x i64> @PR35454_1(<2 x i64> %v) {
-; CHECK-LABEL: @PR35454_1(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
-; CHECK-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
-; CHECK-NEXT:    ret <2 x i64> [[BC3]]
+; SSE-LABEL: @PR35454_1(
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
+; SSE-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; SSE-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[BC3]]
+;
+; AVX-LABEL: @PR35454_1(
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <16 x i8>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[TMP2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
+; AVX-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; AVX-NEXT:    ret <2 x i64> [[BC3]]
 ;
   %bc = bitcast <2 x i64> %v to <4 x i32>
   %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -94,15 +121,25 @@
 }
 
 define <2 x i64> @PR35454_2(<2 x i64> %v) {
-; CHECK-LABEL: @PR35454_2(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
-; CHECK-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
-; CHECK-NEXT:    ret <2 x i64> [[BC3]]
+; SSE-LABEL: @PR35454_2(
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
+; SSE-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[BC3]]
+;
+; AVX-LABEL: @PR35454_2(
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <8 x i16>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
+; AVX-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[TMP2]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; AVX-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
+; AVX-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; AVX-NEXT:    ret <2 x i64> [[BC3]]
 ;
   %bc = bitcast <2 x i64> %v to <4 x i32>
   %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>