Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -244,6 +245,53 @@ return true; } +static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) { + Value *V; + Constant *MaskC; + if (!match(&I, m_BitCast(m_OneUse(m_ShuffleVector(m_Value(V), m_Undef(), + m_Constant(MaskC)))))) + return false; + + Type *DestTy = I.getType(); + Type *SrcTy = V->getType(); + if (!DestTy->isVectorTy() || I.getOperand(0)->getType() != SrcTy) + return false; + + // TODO: Handle bitcast from narrow element type to wide element type. + assert(SrcTy->isVectorTy() && "Shuffle of non-vector type?"); + unsigned DestNumElts = DestTy->getVectorNumElements(); + unsigned SrcNumElts = SrcTy->getVectorNumElements(); + if (SrcNumElts > DestNumElts) + return false; + + // The new shuffle must not cost more than the old shuffle. The bitcast is + // moved ahead of the shuffle, so assume that it has the same cost as before. + if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) > + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy)) + return false; + + // Bitcast the source vector and expand the shuffle mask to the equivalent for + // narrow elements. + // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' + IRBuilder<> Builder(&I); + Value *CastV = Builder.CreateBitCast(V, DestTy); + SmallVector OldMask, NewMask; + ShuffleVectorInst::getShuffleMask(MaskC, OldMask); + assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = DestNumElts / SrcNumElts; + scaleShuffleMask(ScaleFactor, makeArrayRef(OldMask), NewMask); + + // TODO: IRBuilder has a CreateShuffleVector that takes an array of uint32_t, + // but does not recognize plain 'int'. + SmallVector NewMaskC; + for (unsigned i = 0; i != DestNumElts; ++i) + NewMaskC.push_back(Builder.getInt32(NewMask[i])); + Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), + ConstantVector::get(NewMaskC)); + I.replaceAllUsesWith(Shuf); + return true; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. static bool runImpl(Function &F, const TargetTransformInfo &TTI, @@ -261,8 +309,10 @@ // use->defs, so we're more likely to succeed by starting from the bottom. // TODO: It could be more efficient to remove dead instructions // iteratively in this loop rather than waiting until the end. - for (Instruction &I : make_range(BB.rbegin(), BB.rend())) + for (Instruction &I : make_range(BB.rbegin(), BB.rend())) { MadeChange |= foldExtractExtract(I, TTI); + MadeChange |= foldBitcastShuf(I, TTI); + } } // We're done with transforms, so remove dead instructions. Index: llvm/test/Transforms/VectorCombine/X86/shuffle.ll =================================================================== --- llvm/test/Transforms/VectorCombine/X86/shuffle.ll +++ llvm/test/Transforms/VectorCombine/X86/shuffle.ll @@ -2,28 +2,39 @@ ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX +; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb) + define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_narrow_element( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> -; CHECK-NEXT: ret <16 x i8> [[R]] +; SSE-LABEL: @bitcast_shuf_narrow_element( +; SSE-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> +; SSE-NEXT: ret <16 x i8> [[R]] +; +; AVX-LABEL: @bitcast_shuf_narrow_element( +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> +; AVX-NEXT: ret <16 x i8> [[TMP2]] ; %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %r = bitcast <4 x i32> %shuf to <16 x i8> ret <16 x i8> %r } +; v4f32 is the same cost as v4i32, so this always works + define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) { ; CHECK-LABEL: @bitcast_shuf_same_size( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <4 x float> -; CHECK-NEXT: ret <4 x float> [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %r = bitcast <4 x i32> %shuf to <4 x float> ret <4 x float> %r } +; Negative test - length-changing shuffle + define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) { ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_size( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <4 x i32> @@ -35,6 +46,8 @@ ret <16 x i8> %r } +; Negative test - must cast to vector type + define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) { ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> @@ -46,6 +59,8 @@ ret i128 %r } +; Negative test - but might want to try this + define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { ; CHECK-LABEL: @bitcast_shuf_wide_element( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> @@ -59,6 +74,8 @@ declare void @use(<4 x i32>) +; Negative test - don't create an extra shuffle + define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { ; CHECK-LABEL: @bitcast_shuf_uses( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> @@ -73,15 +90,25 @@ } define <2 x i64> @PR35454_1(<2 x i64> %v) { -; CHECK-LABEL: @PR35454_1( -; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> -; CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8> -; CHECK-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], -; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> -; CHECK-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> -; CHECK-NEXT: ret <2 x i64> [[BC3]] +; SSE-LABEL: @PR35454_1( +; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8> +; SSE-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], +; SSE-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> +; SSE-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[BC3]] +; +; AVX-LABEL: @PR35454_1( +; AVX-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <16 x i8> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> +; AVX-NEXT: [[ADD:%.*]] = shl <16 x i8> [[TMP2]], +; AVX-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> +; AVX-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; AVX-NEXT: ret <2 x i64> [[BC3]] ; %bc = bitcast <2 x i64> %v to <4 x i32> %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> @@ -94,15 +121,25 @@ } define <2 x i64> @PR35454_2(<2 x i64> %v) { -; CHECK-LABEL: @PR35454_2( -; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> -; CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16> -; CHECK-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], -; CHECK-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> -; CHECK-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> -; CHECK-NEXT: ret <2 x i64> [[BC3]] +; SSE-LABEL: @PR35454_2( +; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16> +; SSE-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], +; SSE-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> +; SSE-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[BC3]] +; +; AVX-LABEL: @PR35454_2( +; AVX-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <8 x i16> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> +; AVX-NEXT: [[ADD:%.*]] = shl <8 x i16> [[TMP2]], +; AVX-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> +; AVX-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; AVX-NEXT: ret <2 x i64> [[BC3]] ; %bc = bitcast <2 x i64> %v to <4 x i32> %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32>