diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1515,6 +1515,57 @@
   return CastInst::Create(CastOpcode, NewInsElt, InsElt.getType());
 }
 
+/// If we are inserting 2 halves of a value into adjacent elements of a vector,
+/// try to convert to a single insert with appropriate bitcasts.
+static Instruction *foldTruncInsEltPair(InsertElementInst &InsElt,
+                                        bool IsBigEndian,
+                                        InstCombiner::BuilderTy &Builder) {
+  Value *VecOp = InsElt.getOperand(0);
+  Value *ScalarOp = InsElt.getOperand(1);
+  Value *IndexOp = InsElt.getOperand(2);
+
+  // inselt (inselt BaseVec, (trunc X), Index0), (trunc (lshr X, BW/2)), Index1
+  // TODO: The insertion order could be reversed.
+  // TODO: Detect smaller fractions of the scalar.
+  // TODO: One-use checks are conservative.
+  auto *VTy = dyn_cast<FixedVectorType>(InsElt.getType());
+  Value *X, *BaseVec;
+  uint64_t ShAmt, Index0, Index1;
+  if (!VTy || (VTy->getNumElements() & 1) ||
+      !match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec), m_Trunc(m_Value(X)),
+                                         m_ConstantInt(Index0)))) ||
+      !match(ScalarOp, m_OneUse(m_Trunc(m_LShr(m_Specific(X),
+                                               m_ConstantInt(ShAmt))))) ||
+      !match(IndexOp, m_ConstantInt(Index1)))
+    return nullptr;
+
+  Type *SrcTy = X->getType();
+  unsigned ScalarWidth = SrcTy->getScalarSizeInBits();
+  unsigned VecEltWidth = VTy->getScalarSizeInBits();
+  if (ScalarWidth != VecEltWidth * 2 || ShAmt != VecEltWidth)
+    return nullptr;
+
+  // The low half must be inserted at element +1 for big-endian.
+  // The high half must be inserted at element +1 for little-endian.
+  if (IsBigEndian ? Index0 != Index1 + 1 : Index0 + 1 != Index1)
+    return nullptr;
+
+  // The high half must be inserted at an even element for big-endian.
+  // The low half must be inserted at an even element for little-endian.
+  if (IsBigEndian ? Index1 & 1 : Index0 & 1)
+    return nullptr;
+
+  // Bitcast the base vector to a vector type with the source element type.
+  Type *CastTy = FixedVectorType::get(SrcTy, VTy->getNumElements() / 2);
+  Value *CastBaseVec = Builder.CreateBitCast(BaseVec, CastTy);
+
+  // Scale the insert index for a vector with half as many elements.
+  // bitcast (inselt (bitcast BaseVec), X, NewIndex)
+  uint64_t NewIndex = IsBigEndian ? Index1 / 2 : Index0 / 2;
+  Value *NewInsert = Builder.CreateInsertElement(CastBaseVec, X, NewIndex);
+  return new BitCastInst(NewInsert, VTy);
+}
+
 Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
   Value *VecOp = IE.getOperand(0);
   Value *ScalarOp = IE.getOperand(1);
@@ -1642,6 +1693,9 @@
   if (Instruction *Ext = narrowInsElt(IE, Builder))
     return Ext;
 
+  if (Instruction *Ext = foldTruncInsEltPair(IE, DL.isBigEndian(), Builder))
+    return Ext;
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
--- a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
+++ b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
@@ -1,16 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ALL
+; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ALL,BE
+; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ALL,LE
 
+declare void @use(i16)
+declare void @use_vec(<8 x i16>)
+
 define <4 x i16> @insert_01_poison_v4i16(i32 %x) {
-; ALL-LABEL: @insert_01_poison_v4i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
-; ALL-NEXT:    ret <4 x i16> [[INS1]]
+; BE-LABEL: @insert_01_poison_v4i16(
+; BE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; BE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; BE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; BE-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
+; BE-NEXT:    ret <4 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_01_poison_v4i16(
+; LE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 0
+; LE-NEXT:    [[INS1:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x i16>
+; LE-NEXT:    ret <4 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -21,13 +29,18 @@
 }
 
 define <8 x i16> @insert_10_poison_v8i16(i32 %x) {
-; ALL-LABEL: @insert_10_poison_v8i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> poison, i16 [[LO16]], i64 1
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
-; ALL-NEXT:    ret <8 x i16> [[INS1]]
+; BE-LABEL: @insert_10_poison_v8i16(
+; BE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x i16>
+; BE-NEXT:    ret <8 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_10_poison_v8i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> poison, i16 [[LO16]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
+; LE-NEXT:    ret <8 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
   %lo16 = trunc i32 %x to i16
@@ -37,6 +50,8 @@
   ret <8 x i16> %ins1
 }
 
+; negative test
- larger element is not aligned in the vector + define <4 x i32> @insert_12_poison_v4i32(i64 %x) { ; ALL-LABEL: @insert_12_poison_v4i32( ; ALL-NEXT: [[HI64:%.*]] = lshr i64 [[X:%.*]], 32 @@ -54,6 +69,8 @@ ret <4 x i32> %ins1 } +; negative test - larger element is not aligned in the vector + define <4 x i16> @insert_21_poison_v4i16(i32 %x) { ; ALL-LABEL: @insert_21_poison_v4i16( ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 @@ -72,13 +89,18 @@ } define <4 x i32> @insert_23_poison_v4i32(i64 %x) { -; ALL-LABEL: @insert_23_poison_v4i32( -; ALL-NEXT: [[HI64:%.*]] = lshr i64 [[X:%.*]], 32 -; ALL-NEXT: [[HI32:%.*]] = trunc i64 [[HI64]] to i32 -; ALL-NEXT: [[LO32:%.*]] = trunc i64 [[X]] to i32 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i32> poison, i32 [[LO32]], i64 2 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3 -; ALL-NEXT: ret <4 x i32> [[INS1]] +; BE-LABEL: @insert_23_poison_v4i32( +; BE-NEXT: [[HI64:%.*]] = lshr i64 [[X:%.*]], 32 +; BE-NEXT: [[HI32:%.*]] = trunc i64 [[HI64]] to i32 +; BE-NEXT: [[LO32:%.*]] = trunc i64 [[X]] to i32 +; BE-NEXT: [[INS0:%.*]] = insertelement <4 x i32> poison, i32 [[LO32]], i64 2 +; BE-NEXT: [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3 +; BE-NEXT: ret <4 x i32> [[INS1]] +; +; LE-LABEL: @insert_23_poison_v4i32( +; LE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i64 1 +; LE-NEXT: [[INS1:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32> +; LE-NEXT: ret <4 x i32> [[INS1]] ; %hi64 = lshr i64 %x, 32 %hi32 = trunc i64 %hi64 to i32 @@ -89,13 +111,18 @@ } define <4 x i16> @insert_32_poison_v4i16(i32 %x) { -; ALL-LABEL: @insert_32_poison_v4i16( -; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 -; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 -; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 3 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2 -; ALL-NEXT: ret <4 x i16> [[INS1]] +; BE-LABEL: @insert_32_poison_v4i16( +; BE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 1 +; BE-NEXT: [[INS1:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x i16> +; BE-NEXT: ret <4 x i16> [[INS1]] +; +; LE-LABEL: @insert_32_poison_v4i16( +; LE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; LE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; LE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; LE-NEXT: [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 3 +; LE-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2 +; LE-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 %hi16 = trunc i32 %hi32 to i16 @@ -105,14 +132,23 @@ ret <4 x i16> %ins1 } +; Similar to the above tests but with a non-poison base vector. + +; Vector is same size as scalar, so this is just a cast. +; TODO: Could be swapped/rotated into place. 
+ define <2 x i16> @insert_01_v2i16(i32 %x, <2 x i16> %v) { -; ALL-LABEL: @insert_01_v2i16( -; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 -; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 -; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <2 x i16> poison, i16 [[LO16]], i64 0 -; ALL-NEXT: [[INS1:%.*]] = insertelement <2 x i16> [[INS0]], i16 [[HI16]], i64 1 -; ALL-NEXT: ret <2 x i16> [[INS1]] +; BE-LABEL: @insert_01_v2i16( +; BE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; BE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; BE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; BE-NEXT: [[INS0:%.*]] = insertelement <2 x i16> poison, i16 [[LO16]], i64 0 +; BE-NEXT: [[INS1:%.*]] = insertelement <2 x i16> [[INS0]], i16 [[HI16]], i64 1 +; BE-NEXT: ret <2 x i16> [[INS1]] +; +; LE-LABEL: @insert_01_v2i16( +; LE-NEXT: [[INS1:%.*]] = bitcast i32 [[X:%.*]] to <2 x i16> +; LE-NEXT: ret <2 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 %hi16 = trunc i32 %hi32 to i16 @@ -123,13 +159,19 @@ } define <8 x i16> @insert_10_v8i16(i32 %x, <8 x i16> %v) { -; ALL-LABEL: @insert_10_v8i16( -; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 -; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 -; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 1 -; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0 -; ALL-NEXT: ret <8 x i16> [[INS1]] +; BE-LABEL: @insert_10_v8i16( +; BE-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32> +; BE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X:%.*]], i64 0 +; BE-NEXT: [[INS1:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; BE-NEXT: ret <8 x i16> [[INS1]] +; +; LE-LABEL: @insert_10_v8i16( +; LE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; LE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; LE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; LE-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 1 +; LE-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0 +; LE-NEXT: ret <8 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 %hi16 = trunc i32 %hi32 to i16 @@ -139,6 +181,8 @@ ret <8 x i16> %ins1 } +; negative test - larger element is not aligned in the vector + define <4 x i32> @insert_12_v4i32(i64 %x, <4 x i32> %v) { ; ALL-LABEL: @insert_12_v4i32( ; ALL-NEXT: [[HI64:%.*]] = lshr i64 [[X:%.*]], 32 @@ -156,6 +200,8 @@ ret <4 x i32> %ins1 } +; negative test - larger element is not aligned in the vector + define <4 x i16> @insert_21_v4i16(i32 %x, <4 x i16> %v) { ; ALL-LABEL: @insert_21_v4i16( ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 @@ -174,13 +220,19 @@ } define <4 x i32> @insert_23_v4i32(i64 %x, <4 x i32> %v) { -; ALL-LABEL: @insert_23_v4i32( -; ALL-NEXT: [[HI64:%.*]] = lshr i64 [[X:%.*]], 32 -; ALL-NEXT: [[HI32:%.*]] = trunc i64 [[HI64]] to i32 -; ALL-NEXT: [[LO32:%.*]] = trunc i64 [[X]] to i32 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i32> [[V:%.*]], i32 [[LO32]], i64 2 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3 -; ALL-NEXT: ret <4 x i32> [[INS1]] +; BE-LABEL: @insert_23_v4i32( +; BE-NEXT: [[HI64:%.*]] = lshr i64 [[X:%.*]], 32 +; BE-NEXT: [[HI32:%.*]] = trunc i64 [[HI64]] to i32 +; BE-NEXT: [[LO32:%.*]] = trunc i64 [[X]] to i32 +; BE-NEXT: [[INS0:%.*]] = insertelement <4 x i32> [[V:%.*]], i32 [[LO32]], i64 2 +; BE-NEXT: [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3 +; BE-NEXT: ret <4 x i32> [[INS1]] 
+; +; LE-LABEL: @insert_23_v4i32( +; LE-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <2 x i64> +; LE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[X:%.*]], i64 1 +; LE-NEXT: [[INS1:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32> +; LE-NEXT: ret <4 x i32> [[INS1]] ; %hi64 = lshr i64 %x, 32 %hi32 = trunc i64 %hi64 to i32 @@ -191,13 +243,19 @@ } define <4 x i16> @insert_32_v4i16(i32 %x, <4 x i16> %v) { -; ALL-LABEL: @insert_32_v4i16( -; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 -; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 -; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 3 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2 -; ALL-NEXT: ret <4 x i16> [[INS1]] +; BE-LABEL: @insert_32_v4i16( +; BE-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <2 x i32> +; BE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[X:%.*]], i64 1 +; BE-NEXT: [[INS1:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x i16> +; BE-NEXT: ret <4 x i16> [[INS1]] +; +; LE-LABEL: @insert_32_v4i16( +; LE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; LE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; LE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; LE-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 3 +; LE-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2 +; LE-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 %hi16 = trunc i32 %hi32 to i16 @@ -206,3 +264,131 @@ %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 2 ret <4 x i16> %ins1 } + +; negative test - need half-width shift + +define <4 x i16> @insert_01_v4i16_wrong_shift1(i32 %x) { +; ALL-LABEL: @insert_01_v4i16_wrong_shift1( +; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 8 +; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1 +; ALL-NEXT: ret <4 x i16> [[INS1]] +; + %hi32 = lshr i32 %x, 8 + %hi16 = trunc i32 %hi32 to i16 + %lo16 = trunc i32 %x to i16 + %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0 + %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1 + ret <4 x i16> %ins1 +} + +; negative test - need common scalar + +define <4 x i16> @insert_01_v4i16_wrong_op(i32 %x, i32 %y) { +; ALL-LABEL: @insert_01_v4i16_wrong_op( +; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[Y:%.*]] to i16 +; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1 +; ALL-NEXT: ret <4 x i16> [[INS1]] +; + %hi32 = lshr i32 %x, 16 + %hi16 = trunc i32 %hi32 to i16 + %lo16 = trunc i32 %y to i16 + %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0 + %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1 + ret <4 x i16> %ins1 +} + +; TODO: extra use doesn't have to prevent the fold. 
+ +define <8 x i16> @insert_67_v4i16_uses1(i32 %x, <8 x i16> %v) { +; ALL-LABEL: @insert_67_v4i16_uses1( +; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; ALL-NEXT: call void @use(i16 [[HI16]]) +; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; ALL-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 6 +; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 7 +; ALL-NEXT: ret <8 x i16> [[INS1]] +; + %hi32 = lshr i32 %x, 16 + %hi16 = trunc i32 %hi32 to i16 + call void @use(i16 %hi16) + %lo16 = trunc i32 %x to i16 + %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 6 + %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 7 + ret <8 x i16> %ins1 +} + +; extra use is ok + +define <8 x i16> @insert_76_v4i16_uses2(i32 %x, <8 x i16> %v) { +; BE-LABEL: @insert_76_v4i16_uses2( +; BE-NEXT: [[LO16:%.*]] = trunc i32 [[X:%.*]] to i16 +; BE-NEXT: call void @use(i16 [[LO16]]) +; BE-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32> +; BE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X]], i64 3 +; BE-NEXT: [[INS1:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; BE-NEXT: ret <8 x i16> [[INS1]] +; +; LE-LABEL: @insert_76_v4i16_uses2( +; LE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; LE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; LE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; LE-NEXT: call void @use(i16 [[LO16]]) +; LE-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 7 +; LE-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 6 +; LE-NEXT: ret <8 x i16> [[INS1]] +; + %hi32 = lshr i32 %x, 16 + %hi16 = trunc i32 %hi32 to i16 + %lo16 = trunc i32 %x to i16 + call void @use(i16 %lo16) + %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 7 + %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 6 + ret <8 x i16> %ins1 +} + +; TODO: extra use doesn't have to prevent the fold. + +define <8 x i16> @insert_67_v4i16_uses3(i32 %x, <8 x i16> %v) { +; ALL-LABEL: @insert_67_v4i16_uses3( +; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; ALL-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 6 +; ALL-NEXT: call void @use_vec(<8 x i16> [[INS0]]) +; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 7 +; ALL-NEXT: ret <8 x i16> [[INS1]] +; + %hi32 = lshr i32 %x, 16 + %hi16 = trunc i32 %hi32 to i16 + %lo16 = trunc i32 %x to i16 + %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 6 + call void @use_vec(<8 x i16> %ins0) + %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 7 + ret <8 x i16> %ins1 +} + +; TODO: This is equivalent to the 1st test. 
+ +define <4 x i16> @insert_01_poison_v4i16_high_first(i32 %x) { +; ALL-LABEL: @insert_01_poison_v4i16_high_first( +; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 1 +; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[INS1]], i16 [[LO16]], i64 0 +; ALL-NEXT: ret <4 x i16> [[INS0]] +; + %hi32 = lshr i32 %x, 16 + %hi16 = trunc i32 %hi32 to i16 + %lo16 = trunc i32 %x to i16 + %ins1 = insertelement <4 x i16> poison, i16 %hi16, i64 1 + %ins0 = insertelement <4 x i16> %ins1, i16 %lo16, i64 0 + ret <4 x i16> %ins0 +} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll @@ -47,34 +47,24 @@ define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 { ; SSE-LABEL: @ConvertVectors_ByVal( ; SSE-NEXT: entry: -; SSE-NEXT: [[V_VAL20:%.*]] = load i64, ptr [[V:%.*]], align 16 -; SSE-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[V]], i64 8 -; SSE-NEXT: [[V_VAL421:%.*]] = load i64, ptr [[TMP0]], align 8 -; SSE-NEXT: [[TMP1:%.*]] = lshr i64 [[V_VAL20]], 32 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[V_VAL20]], i64 0 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP1]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = trunc i64 [[V_VAL421]] to i32 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 3 -; SSE-NEXT: [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float> +; SSE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16 +; SSE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8 +; SSE-NEXT: [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8 +; SSE-NEXT: [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32 +; SSE-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float +; SSE-NEXT: [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2 +; SSE-NEXT: [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3 ; SSE-NEXT: ret <4 x float> [[VECINIT16]] ; ; AVX-LABEL: @ConvertVectors_ByVal( ; AVX-NEXT: entry: -; AVX-NEXT: [[V_VAL20:%.*]] = load i64, ptr [[V:%.*]], align 16 -; AVX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[V]], i64 8 -; AVX-NEXT: [[V_VAL421:%.*]] = load i64, ptr [[TMP0]], align 8 -; AVX-NEXT: [[TMP1:%.*]] = trunc i64 [[V_VAL20]] to i32 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i64 0 -; AVX-NEXT: [[TMP3:%.*]] = lshr i64 [[V_VAL20]], 32 -; AVX-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i64 1 -; AVX-NEXT: [[TMP6:%.*]] = trunc i64 [[V_VAL421]] to i32 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 3 -; AVX-NEXT: [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float> +; AVX-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16 +; AVX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 
8 +; AVX-NEXT: [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8 +; AVX-NEXT: [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32 +; AVX-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float +; AVX-NEXT: [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2 +; AVX-NEXT: [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3 ; AVX-NEXT: ret <4 x float> [[VECINIT16]] ; entry:
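
For reference, the transform exercised by the little-endian runs above, reduced to a minimal before/after sketch (the function names are illustrative only, not part of the patch):

define <4 x i16> @two_half_inserts(i32 %x) {
  %hi32 = lshr i32 %x, 16
  %hi16 = trunc i32 %hi32 to i16
  %lo16 = trunc i32 %x to i16
  %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0
  %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1
  ret <4 x i16> %ins1
}

; ...becomes a single insert of the full-width scalar plus a bitcast:

define <4 x i16> @one_insert_bitcast(i32 %x) {
  %wide = insertelement <2 x i32> poison, i32 %x, i64 0
  %res = bitcast <2 x i32> %wide to <4 x i16>
  ret <4 x i16> %res
}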