Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -167,7 +167,8 @@ } static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, - InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder, + bool IsBigEndian) { Value *X; uint64_t ExtIndexC; if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) || @@ -186,6 +187,49 @@ if (Value *Elt = findScalarElement(X, ExtIndexC)) return new BitCastInst(Elt, DestTy); + // If the source elements are wider than the destination, try to shift and + // truncate a subset of scalar bits of an insert op. + // TODO: This is limited to integer types, but we could bitcast to/from FP. + if (NumSrcElts < NumElts && SrcTy->getScalarType()->isIntegerTy() && + DestTy->getScalarType()->isIntegerTy()) { + Value *Scalar; + uint64_t InsIndexC; + if (!match(X, m_InsertElement(m_Value(), m_Value(Scalar), + m_ConstantInt(InsIndexC)))) + return nullptr; + + // The extract must be from the subset of vector elements that we inserted + // into. Example: if we inserted element 1 of a <2 x i64> and we are + // extracting an i16 (narrowing ratio = 4), then this extract must be from 1 + // of elements 4-7 of the bitcasted vector. + unsigned NarrowingRatio = NumElts / NumSrcElts; + if (ExtIndexC / NarrowingRatio != InsIndexC) + return nullptr; + + // We are extracting part of the original scalar. How that scalar is + // inserted into the vector depends on the endian-ness. Example: + // Vector Byte Elt Index: 0 1 2 3 4 5 6 7 + // +--+--+--+--+--+--+--+--+ + // inselt <2 x i32> V, S, 1: |V0|V1|V2|V3|S0|S1|S2|S3| + // extelt <4 x i16> V', 3: | |S2|S3| + // +--+--+--+--+--+--+--+--+ + // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value. + // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value. + // In this example, we must right-shift little-endian. Big-endian is just a + // truncate. + unsigned Chunk = ExtIndexC % NarrowingRatio; + if (IsBigEndian) + Chunk = NarrowingRatio - 1 - Chunk; + unsigned ShAmt = Chunk * DestTy->getPrimitiveSizeInBits(); + if (ShAmt) { + // Bail out if we could end with more instructions than we started with. + if (!Ext.getVectorOperand()->hasOneUse()) + return nullptr; + Scalar = Builder.CreateLShr(Scalar, ShAmt); + } + return new TruncInst(Scalar, DestTy); + } + return nullptr; } @@ -224,7 +268,7 @@ } } - if (Instruction *I = foldBitcastExtElt(EI, Builder)) + if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian())) return I; // If there's a vector PHI feeding a scalar use through this extractelement Index: llvm/trunk/test/Transforms/InstCombine/extractelement.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/extractelement.ll +++ llvm/trunk/test/Transforms/InstCombine/extractelement.ll @@ -42,11 +42,14 @@ } define i32 @bitcasted_inselt_wide_source_zero_elt(i64 %x) { -; ANY-LABEL: @bitcasted_inselt_wide_source_zero_elt( -; ANY-NEXT: [[I:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0 -; ANY-NEXT: [[B:%.*]] = bitcast <2 x i64> [[I]] to <4 x i32> -; ANY-NEXT: [[R:%.*]] = extractelement <4 x i32> [[B]], i32 0 -; ANY-NEXT: ret i32 [[R]] +; LE-LABEL: @bitcasted_inselt_wide_source_zero_elt( +; LE-NEXT: [[R:%.*]] = trunc i64 [[X:%.*]] to i32 +; LE-NEXT: ret i32 [[R]] +; +; BE-LABEL: @bitcasted_inselt_wide_source_zero_elt( +; BE-NEXT: [[TMP1:%.*]] = lshr i64 [[X:%.*]], 32 +; BE-NEXT: [[R:%.*]] = trunc i64 [[TMP1]] to i32 +; BE-NEXT: ret i32 [[R]] ; %i = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0 %b = bitcast <2 x i64> %i to <4 x i32> @@ -55,11 +58,14 @@ } define i16 @bitcasted_inselt_wide_source_modulo_elt(i64 %x) { -; ANY-LABEL: @bitcasted_inselt_wide_source_modulo_elt( -; ANY-NEXT: [[I:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1 -; ANY-NEXT: [[B:%.*]] = bitcast <2 x i64> [[I]] to <8 x i16> -; ANY-NEXT: [[R:%.*]] = extractelement <8 x i16> [[B]], i32 4 -; ANY-NEXT: ret i16 [[R]] +; LE-LABEL: @bitcasted_inselt_wide_source_modulo_elt( +; LE-NEXT: [[R:%.*]] = trunc i64 [[X:%.*]] to i16 +; LE-NEXT: ret i16 [[R]] +; +; BE-LABEL: @bitcasted_inselt_wide_source_modulo_elt( +; BE-NEXT: [[TMP1:%.*]] = lshr i64 [[X:%.*]], 48 +; BE-NEXT: [[R:%.*]] = trunc i64 [[TMP1]] to i16 +; BE-NEXT: ret i16 [[R]] ; %i = insertelement <2 x i64> undef, i64 %x, i32 1 %b = bitcast <2 x i64> %i to <8 x i16> @@ -68,11 +74,14 @@ } define i32 @bitcasted_inselt_wide_source_not_modulo_elt(i64 %x) { -; ANY-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt( -; ANY-NEXT: [[I:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0 -; ANY-NEXT: [[B:%.*]] = bitcast <2 x i64> [[I]] to <4 x i32> -; ANY-NEXT: [[R:%.*]] = extractelement <4 x i32> [[B]], i32 1 -; ANY-NEXT: ret i32 [[R]] +; LE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt( +; LE-NEXT: [[TMP1:%.*]] = lshr i64 [[X:%.*]], 32 +; LE-NEXT: [[R:%.*]] = trunc i64 [[TMP1]] to i32 +; LE-NEXT: ret i32 [[R]] +; +; BE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt( +; BE-NEXT: [[R:%.*]] = trunc i64 [[X:%.*]] to i32 +; BE-NEXT: ret i32 [[R]] ; %i = insertelement <2 x i64> undef, i64 %x, i32 0 %b = bitcast <2 x i64> %i to <4 x i32> @@ -81,11 +90,15 @@ } define i8 @bitcasted_inselt_wide_source_not_modulo_elt_not_half(i32 %x) { -; ANY-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half( -; ANY-NEXT: [[I:%.*]] = insertelement <2 x i32> undef, i32 [[X:%.*]], i32 0 -; ANY-NEXT: [[B:%.*]] = bitcast <2 x i32> [[I]] to <8 x i8> -; ANY-NEXT: [[R:%.*]] = extractelement <8 x i8> [[B]], i32 2 -; ANY-NEXT: ret i8 [[R]] +; LE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half( +; LE-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 16 +; LE-NEXT: [[R:%.*]] = trunc i32 [[TMP1]] to i8 +; LE-NEXT: ret i8 [[R]] +; +; BE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half( +; BE-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 8 +; BE-NEXT: [[R:%.*]] = trunc i32 [[TMP1]] to i8 +; BE-NEXT: ret i8 [[R]] ; %i = insertelement <2 x i32> undef, i32 %x, i32 0 %b = bitcast <2 x i32> %i to <8 x i8> @@ -94,11 +107,15 @@ } define i3 @bitcasted_inselt_wide_source_not_modulo_elt_not_half_weird_types(i15 %x) { -; ANY-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half_weird_types( -; ANY-NEXT: [[I:%.*]] = insertelement <3 x i15> undef, i15 [[X:%.*]], i32 0 -; ANY-NEXT: [[B:%.*]] = bitcast <3 x i15> [[I]] to <15 x i3> -; ANY-NEXT: [[R:%.*]] = extractelement <15 x i3> [[B]], i32 1 -; ANY-NEXT: ret i3 [[R]] +; LE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half_weird_types( +; LE-NEXT: [[TMP1:%.*]] = lshr i15 [[X:%.*]], 3 +; LE-NEXT: [[R:%.*]] = trunc i15 [[TMP1]] to i3 +; LE-NEXT: ret i3 [[R]] +; +; BE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half_weird_types( +; BE-NEXT: [[TMP1:%.*]] = lshr i15 [[X:%.*]], 9 +; BE-NEXT: [[R:%.*]] = trunc i15 [[TMP1]] to i3 +; BE-NEXT: ret i3 [[R]] ; %i = insertelement <3 x i15> undef, i15 %x, i32 0 %b = bitcast <3 x i15> %i to <15 x i3> @@ -125,12 +142,19 @@ declare void @use(<8 x i8>) define i8 @bitcasted_inselt_wide_source_uses(i32 %x) { -; ANY-LABEL: @bitcasted_inselt_wide_source_uses( -; ANY-NEXT: [[I:%.*]] = insertelement <2 x i32> undef, i32 [[X:%.*]], i32 0 -; ANY-NEXT: [[B:%.*]] = bitcast <2 x i32> [[I]] to <8 x i8> -; ANY-NEXT: call void @use(<8 x i8> [[B]]) -; ANY-NEXT: [[R:%.*]] = extractelement <8 x i8> [[B]], i32 3 -; ANY-NEXT: ret i8 [[R]] +; LE-LABEL: @bitcasted_inselt_wide_source_uses( +; LE-NEXT: [[I:%.*]] = insertelement <2 x i32> undef, i32 [[X:%.*]], i32 0 +; LE-NEXT: [[B:%.*]] = bitcast <2 x i32> [[I]] to <8 x i8> +; LE-NEXT: call void @use(<8 x i8> [[B]]) +; LE-NEXT: [[R:%.*]] = extractelement <8 x i8> [[B]], i32 3 +; LE-NEXT: ret i8 [[R]] +; +; BE-LABEL: @bitcasted_inselt_wide_source_uses( +; BE-NEXT: [[I:%.*]] = insertelement <2 x i32> undef, i32 [[X:%.*]], i32 0 +; BE-NEXT: [[B:%.*]] = bitcast <2 x i32> [[I]] to <8 x i8> +; BE-NEXT: call void @use(<8 x i8> [[B]]) +; BE-NEXT: [[R:%.*]] = trunc i32 [[X]] to i8 +; BE-NEXT: ret i8 [[R]] ; %i = insertelement <2 x i32> undef, i32 %x, i32 0 %b = bitcast <2 x i32> %i to <8 x i8>