diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -516,6 +516,7 @@
     SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
     SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
+    SDValue replaceStoreOfInsertLoad(StoreSDNode *ST);
 
     bool refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(SDNode *N);
 
@@ -20409,6 +20410,62 @@
   }
 }
 
+// (store (insert_vector_elt (load p), x, i), p) -> (store x, p+offset)
+//
+// If a store of a load with an element inserted into it has no other
+// uses in between the chain, then we can consider the vector store
+// dead and replace it with just the single scalar element store.
+SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
+  SDLoc DL(ST);
+  SDValue Value = ST->getValue();
+  SDValue Ptr = ST->getBasePtr();
+  SDValue Chain = ST->getChain();
+  if (Value.getOpcode() != ISD::INSERT_VECTOR_ELT || !Value.hasOneUse())
+    return SDValue();
+
+  SDValue Elt = Value.getOperand(1);
+  SDValue Idx = Value.getOperand(2);
+
+  // If the element isn't byte sized then we can't compute an offset
+  EVT EltVT = Elt.getValueType();
+  if (!EltVT.isByteSized())
+    return SDValue();
+
+  auto *Ld = dyn_cast<LoadSDNode>(Value.getOperand(0));
+  if (!Ld || Ld->getBasePtr() != Ptr ||
+      ST->getMemoryVT() != Ld->getMemoryVT() || !ST->isSimple() ||
+      !ISD::isNormalStore(ST) ||
+      Ld->getAddressSpace() != ST->getAddressSpace() ||
+      !Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1)))
+    return SDValue();
+
+  unsigned IsFast;
+  if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                              Elt.getValueType(), ST->getAddressSpace(),
+                              ST->getAlign(), ST->getMemOperand()->getFlags(),
+                              &IsFast) ||
+      !IsFast)
+    return SDValue();
+  EVT PtrVT = Ptr.getValueType();
+
+  SDValue Offset =
+      DAG.getNode(ISD::MUL, DL, PtrVT, Idx,
+                  DAG.getConstant(EltVT.getSizeInBits() / 8, DL, PtrVT));
+  SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Offset);
+  MachinePointerInfo PointerInfo(ST->getAddressSpace());
+
+  // If the offset is a known constant then try to recover the pointer
+  // info
+  if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+    unsigned COffset = CIdx->getSExtValue() * EltVT.getSizeInBits() / 8;
+    NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(COffset), DL);
+    PointerInfo = ST->getPointerInfo().getWithOffset(COffset);
+  }
+
+  return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),
+                      ST->getMemOperand()->getFlags());
+}
+
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -20548,6 +20605,10 @@
     }
   }
 
+  // Try scalarizing vector stores of loads where we only change one element
+  if (SDValue NewST = replaceStoreOfInsertLoad(ST))
+    return NewST;
+
   // TODO: Can relax for unordered atomics (see D66309)
   if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
     if (ST->isUnindexed() && ST->isSimple() &&
diff --git a/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll b/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
--- a/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
+++ b/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
@@ -13,9 +13,7 @@
 ; CHECK-NEXT: ; %bb.1: ; %bb1
 ; CHECK-NEXT: ldr d0, [x1]
 ; CHECK-NEXT: LBB0_2: ; %bb2
-; CHECK-NEXT: ldr q1, [x8]
-; CHECK-NEXT: mov.d v1[0], v0[0]
-; CHECK-NEXT: str q1, [x8]
+; CHECK-NEXT: str d0, [x8]
 ; CHECK-NEXT: ret
 entry:
   br i1 %c, label %bb1, label %bb2
diff --git
a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll --- a/llvm/test/CodeGen/ARM/fp16-promote.ll +++ b/llvm/test/CodeGen/ARM/fp16-promote.ll @@ -865,8 +865,14 @@ ; CHECK-VFP: ldrh ; CHECK-VFP: stm ; CHECK-VFP: strh -; CHECK-VFP: ldm -; CHECK-VFP: stm +; CHECK-VFP: ldrh +; CHECK-VFP: ldrh +; CHECK-VFP: ldrh +; CHECK-VFP: ldrh +; CHECK-VFP: strh +; CHECK-VFP: strh +; CHECK-VFP: strh +; CHECK-VFP: strh ; CHECK-NOVFP: ldrh ; CHECK-NOVFP: ldrh @@ -893,7 +899,7 @@ %a = load half, ptr %p, align 2 %b = load <4 x half>, ptr %q, align 8 %c = insertelement <4 x half> %b, half %a, i32 %i - store <4 x half> %c, ptr %q + store volatile <4 x half> %c, ptr %q ret void } diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll --- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll +++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll @@ -134,9 +134,8 @@ define void @i64_insertelement(ptr %ptr, ptr %vp) nounwind { ; CHECK-LABEL: i64_insertelement: ; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NEXT: ldm r0, {r2, r3} +; CHECK-NEXT: strd r2, r3, [r1] ; CHECK-NEXT: bx lr %t0 = load i64, ptr %ptr, align 4 %vec = load <2 x i64>, ptr %vp diff --git a/llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll b/llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll --- a/llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll @@ -9,7 +9,7 @@ b0: %v1 = load <64 x half>, ptr %v0, align 2 %v2 = insertelement <64 x half> %v1, half 0xH4170, i32 17 - store <64 x half> %v2, ptr %v0, align 2 + store volatile <64 x half> %v2, ptr %v0, align 2 ret ptr %v0 } diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll --- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll +++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll @@ -1740,10 +1740,8 @@ ; O32-NEXT: addiu $2, $2, %lo(_gp_disp) ; O32-NEXT: addu $1, $2, $25 ; O32-NEXT: lw $1, %got(v16i8)($1) -; O32-NEXT: ld.b $w0, 0($1) -; O32-NEXT: insert.b $w0[1], $4 ; O32-NEXT: jr $ra -; O32-NEXT: st.b $w0, 0($1) +; O32-NEXT: sb $4, 1($1) ; ; N32-LABEL: insert_v16i8: ; N32: # %bb.0: @@ -1751,10 +1749,8 @@ ; N32-NEXT: addu $1, $1, $25 ; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(insert_v16i8))) ; N32-NEXT: lw $1, %got_disp(v16i8)($1) -; N32-NEXT: ld.b $w0, 0($1) -; N32-NEXT: insert.b $w0[1], $4 ; N32-NEXT: jr $ra -; N32-NEXT: st.b $w0, 0($1) +; N32-NEXT: sb $4, 1($1) ; ; N64-LABEL: insert_v16i8: ; N64: # %bb.0: @@ -1762,10 +1758,8 @@ ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v16i8))) ; N64-NEXT: ld $1, %got_disp(v16i8)($1) -; N64-NEXT: ld.b $w0, 0($1) -; N64-NEXT: insert.b $w0[1], $4 ; N64-NEXT: jr $ra -; N64-NEXT: st.b $w0, 0($1) +; N64-NEXT: sb $4, 1($1) %1 = load <16 x i8>, ptr @v16i8 %a2 = trunc i32 %a to i8 %a3 = sext i8 %a2 to i32 @@ -1782,10 +1776,8 @@ ; O32-NEXT: addiu $2, $2, %lo(_gp_disp) ; O32-NEXT: addu $1, $2, $25 ; O32-NEXT: lw $1, %got(v8i16)($1) -; O32-NEXT: ld.h $w0, 0($1) -; O32-NEXT: insert.h $w0[1], $4 ; O32-NEXT: jr $ra -; O32-NEXT: st.h $w0, 0($1) +; O32-NEXT: sh $4, 2($1) ; ; N32-LABEL: insert_v8i16: ; N32: # %bb.0: @@ -1793,10 +1785,8 @@ ; N32-NEXT: addu $1, $1, $25 ; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(insert_v8i16))) ; N32-NEXT: lw $1, %got_disp(v8i16)($1) -; N32-NEXT: ld.h $w0, 0($1) -; N32-NEXT: insert.h $w0[1], $4 ; N32-NEXT: jr $ra -; N32-NEXT: st.h $w0, 0($1) +; N32-NEXT: sh $4, 2($1) ; ; N64-LABEL: 
insert_v8i16: ; N64: # %bb.0: @@ -1804,10 +1794,8 @@ ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v8i16))) ; N64-NEXT: ld $1, %got_disp(v8i16)($1) -; N64-NEXT: ld.h $w0, 0($1) -; N64-NEXT: insert.h $w0[1], $4 ; N64-NEXT: jr $ra -; N64-NEXT: st.h $w0, 0($1) +; N64-NEXT: sh $4, 2($1) %1 = load <8 x i16>, ptr @v8i16 %a2 = trunc i32 %a to i16 %a3 = sext i16 %a2 to i32 @@ -1824,10 +1812,8 @@ ; O32-NEXT: addiu $2, $2, %lo(_gp_disp) ; O32-NEXT: addu $1, $2, $25 ; O32-NEXT: lw $1, %got(v4i32)($1) -; O32-NEXT: ld.w $w0, 0($1) -; O32-NEXT: insert.w $w0[1], $4 ; O32-NEXT: jr $ra -; O32-NEXT: st.w $w0, 0($1) +; O32-NEXT: sw $4, 4($1) ; ; N32-LABEL: insert_v4i32: ; N32: # %bb.0: @@ -1835,10 +1821,8 @@ ; N32-NEXT: addu $1, $1, $25 ; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(insert_v4i32))) ; N32-NEXT: lw $1, %got_disp(v4i32)($1) -; N32-NEXT: ld.w $w0, 0($1) -; N32-NEXT: insert.w $w0[1], $4 ; N32-NEXT: jr $ra -; N32-NEXT: st.w $w0, 0($1) +; N32-NEXT: sw $4, 4($1) ; ; N64-LABEL: insert_v4i32: ; N64: # %bb.0: @@ -1846,10 +1830,8 @@ ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v4i32))) ; N64-NEXT: ld $1, %got_disp(v4i32)($1) -; N64-NEXT: ld.w $w0, 0($1) -; N64-NEXT: insert.w $w0[1], $4 ; N64-NEXT: jr $ra -; N64-NEXT: st.w $w0, 0($1) +; N64-NEXT: sw $4, 4($1) %1 = load <4 x i32>, ptr @v4i32 %2 = insertelement <4 x i32> %1, i32 %a, i32 1 store <4 x i32> %2, ptr @v4i32 @@ -1862,11 +1844,9 @@ ; O32-NEXT: addiu $2, $2, %lo(_gp_disp) ; O32-NEXT: addu $1, $2, $25 ; O32-NEXT: lw $1, %got(v2i64)($1) -; O32-NEXT: ld.w $w0, 0($1) -; O32-NEXT: insert.w $w0[2], $4 -; O32-NEXT: insert.w $w0[3], $5 +; O32-NEXT: sw $5, 12($1) ; O32-NEXT: jr $ra -; O32-NEXT: st.w $w0, 0($1) +; O32-NEXT: sw $4, 8($1) ; ; N32-LABEL: insert_v2i64: ; N32: # %bb.0: @@ -1874,10 +1854,8 @@ ; N32-NEXT: addu $1, $1, $25 ; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(insert_v2i64))) ; N32-NEXT: lw $1, %got_disp(v2i64)($1) -; N32-NEXT: ld.d $w0, 0($1) -; N32-NEXT: insert.d $w0[1], $4 ; N32-NEXT: jr $ra -; N32-NEXT: st.d $w0, 0($1) +; N32-NEXT: sd $4, 8($1) ; ; N64-LABEL: insert_v2i64: ; N64: # %bb.0: @@ -1885,10 +1863,8 @@ ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v2i64))) ; N64-NEXT: ld $1, %got_disp(v2i64)($1) -; N64-NEXT: ld.d $w0, 0($1) -; N64-NEXT: insert.d $w0[1], $4 ; N64-NEXT: jr $ra -; N64-NEXT: st.d $w0, 0($1) +; N64-NEXT: sd $4, 8($1) %1 = load <2 x i64>, ptr @v2i64 %2 = insertelement <2 x i64> %1, i64 %a, i32 1 store <2 x i64> %2, ptr @v2i64 @@ -1904,13 +1880,9 @@ ; O32-NEXT: lw $2, %got(i32)($1) ; O32-NEXT: lw $2, 0($2) ; O32-NEXT: lw $1, %got(v16i8)($1) -; O32-NEXT: ld.b $w0, 0($1) -; O32-NEXT: sld.b $w0, $w0[$2] -; O32-NEXT: insert.b $w0[0], $4 -; O32-NEXT: neg $2, $2 -; O32-NEXT: sld.b $w0, $w0[$2] +; O32-NEXT: addu $1, $1, $2 ; O32-NEXT: jr $ra -; O32-NEXT: st.b $w0, 0($1) +; O32-NEXT: sb $4, 0($1) ; ; N32-LABEL: insert_v16i8_vidx: ; N32: # %bb.0: @@ -1920,13 +1892,9 @@ ; N32-NEXT: lw $2, %got_disp(i32)($1) ; N32-NEXT: lw $2, 0($2) ; N32-NEXT: lw $1, %got_disp(v16i8)($1) -; N32-NEXT: ld.b $w0, 0($1) -; N32-NEXT: sld.b $w0, $w0[$2] -; N32-NEXT: insert.b $w0[0], $4 -; N32-NEXT: neg $2, $2 -; N32-NEXT: sld.b $w0, $w0[$2] +; N32-NEXT: addu $1, $1, $2 ; N32-NEXT: jr $ra -; N32-NEXT: st.b $w0, 0($1) +; N32-NEXT: sb $4, 0($1) ; ; N64-LABEL: insert_v16i8_vidx: ; N64: # %bb.0: @@ -1936,13 +1904,9 @@ ; N64-NEXT: ld $2, %got_disp(i32)($1) ; N64-NEXT: lwu $2, 0($2) ; N64-NEXT: ld $1, %got_disp(v16i8)($1) -; N64-NEXT: ld.b $w0, 0($1) -; 
N64-NEXT: sld.b $w0, $w0[$2] -; N64-NEXT: insert.b $w0[0], $4 -; N64-NEXT: dneg $2, $2 -; N64-NEXT: sld.b $w0, $w0[$2] +; N64-NEXT: daddu $1, $1, $2 ; N64-NEXT: jr $ra -; N64-NEXT: st.b $w0, 0($1) +; N64-NEXT: sb $4, 0($1) %1 = load <16 x i8>, ptr @v16i8 %2 = load i32, ptr @i32 %a2 = trunc i32 %a to i8 @@ -1962,14 +1926,9 @@ ; O32-NEXT: lw $2, %got(i32)($1) ; O32-NEXT: lw $2, 0($2) ; O32-NEXT: lw $1, %got(v8i16)($1) -; O32-NEXT: ld.h $w0, 0($1) -; O32-NEXT: sll $2, $2, 1 -; O32-NEXT: sld.b $w0, $w0[$2] -; O32-NEXT: insert.h $w0[0], $4 -; O32-NEXT: neg $2, $2 -; O32-NEXT: sld.b $w0, $w0[$2] +; O32-NEXT: lsa $1, $2, $1, 1 ; O32-NEXT: jr $ra -; O32-NEXT: st.h $w0, 0($1) +; O32-NEXT: sh $4, 0($1) ; ; N32-LABEL: insert_v8i16_vidx: ; N32: # %bb.0: @@ -1979,14 +1938,9 @@ ; N32-NEXT: lw $2, %got_disp(i32)($1) ; N32-NEXT: lw $2, 0($2) ; N32-NEXT: lw $1, %got_disp(v8i16)($1) -; N32-NEXT: ld.h $w0, 0($1) -; N32-NEXT: sll $2, $2, 1 -; N32-NEXT: sld.b $w0, $w0[$2] -; N32-NEXT: insert.h $w0[0], $4 -; N32-NEXT: neg $2, $2 -; N32-NEXT: sld.b $w0, $w0[$2] +; N32-NEXT: lsa $1, $2, $1, 1 ; N32-NEXT: jr $ra -; N32-NEXT: st.h $w0, 0($1) +; N32-NEXT: sh $4, 0($1) ; ; N64-LABEL: insert_v8i16_vidx: ; N64: # %bb.0: @@ -1996,14 +1950,9 @@ ; N64-NEXT: ld $2, %got_disp(i32)($1) ; N64-NEXT: lwu $2, 0($2) ; N64-NEXT: ld $1, %got_disp(v8i16)($1) -; N64-NEXT: ld.h $w0, 0($1) -; N64-NEXT: dsll $2, $2, 1 -; N64-NEXT: sld.b $w0, $w0[$2] -; N64-NEXT: insert.h $w0[0], $4 -; N64-NEXT: dneg $2, $2 -; N64-NEXT: sld.b $w0, $w0[$2] +; N64-NEXT: dlsa $1, $2, $1, 1 ; N64-NEXT: jr $ra -; N64-NEXT: st.h $w0, 0($1) +; N64-NEXT: sh $4, 0($1) %1 = load <8 x i16>, ptr @v8i16 %2 = load i32, ptr @i32 %a2 = trunc i32 %a to i16 @@ -2023,14 +1972,9 @@ ; O32-NEXT: lw $2, %got(i32)($1) ; O32-NEXT: lw $2, 0($2) ; O32-NEXT: lw $1, %got(v4i32)($1) -; O32-NEXT: ld.w $w0, 0($1) -; O32-NEXT: sll $2, $2, 2 -; O32-NEXT: sld.b $w0, $w0[$2] -; O32-NEXT: insert.w $w0[0], $4 -; O32-NEXT: neg $2, $2 -; O32-NEXT: sld.b $w0, $w0[$2] +; O32-NEXT: lsa $1, $2, $1, 2 ; O32-NEXT: jr $ra -; O32-NEXT: st.w $w0, 0($1) +; O32-NEXT: sw $4, 0($1) ; ; N32-LABEL: insert_v4i32_vidx: ; N32: # %bb.0: @@ -2040,14 +1984,9 @@ ; N32-NEXT: lw $2, %got_disp(i32)($1) ; N32-NEXT: lw $2, 0($2) ; N32-NEXT: lw $1, %got_disp(v4i32)($1) -; N32-NEXT: ld.w $w0, 0($1) -; N32-NEXT: sll $2, $2, 2 -; N32-NEXT: sld.b $w0, $w0[$2] -; N32-NEXT: insert.w $w0[0], $4 -; N32-NEXT: neg $2, $2 -; N32-NEXT: sld.b $w0, $w0[$2] +; N32-NEXT: lsa $1, $2, $1, 2 ; N32-NEXT: jr $ra -; N32-NEXT: st.w $w0, 0($1) +; N32-NEXT: sw $4, 0($1) ; ; N64-LABEL: insert_v4i32_vidx: ; N64: # %bb.0: @@ -2057,14 +1996,9 @@ ; N64-NEXT: ld $2, %got_disp(i32)($1) ; N64-NEXT: lwu $2, 0($2) ; N64-NEXT: ld $1, %got_disp(v4i32)($1) -; N64-NEXT: ld.w $w0, 0($1) -; N64-NEXT: dsll $2, $2, 2 -; N64-NEXT: sld.b $w0, $w0[$2] -; N64-NEXT: insert.w $w0[0], $4 -; N64-NEXT: dneg $2, $2 -; N64-NEXT: sld.b $w0, $w0[$2] +; N64-NEXT: dlsa $1, $2, $1, 2 ; N64-NEXT: jr $ra -; N64-NEXT: st.w $w0, 0($1) +; N64-NEXT: sw $4, 0($1) %1 = load <4 x i32>, ptr @v4i32 %2 = load i32, ptr @i32 %3 = insertelement <4 x i32> %1, i32 %a, i32 %2 @@ -2084,22 +2018,11 @@ ; O32-NEXT: addu $1, $2, $25 ; O32-NEXT: lw $2, %got(i32)($1) ; O32-NEXT: lw $2, 0($2) -; O32-NEXT: addu $2, $2, $2 ; O32-NEXT: lw $1, %got(v2i64)($1) -; O32-NEXT: ld.w $w0, 0($1) -; O32-NEXT: sll $3, $2, 2 -; O32-NEXT: sld.b $w0, $w0[$3] -; O32-NEXT: insert.w $w0[0], $4 -; O32-NEXT: neg $3, $3 -; O32-NEXT: sld.b $w0, $w0[$3] -; O32-NEXT: addiu $2, $2, 1 -; O32-NEXT: sll $2, $2, 2 -; O32-NEXT: sld.b 
$w0, $w0[$2] -; O32-NEXT: insert.w $w0[0], $5 -; O32-NEXT: neg $2, $2 -; O32-NEXT: sld.b $w0, $w0[$2] +; O32-NEXT: lsa $1, $2, $1, 3 +; O32-NEXT: sw $5, 4($1) ; O32-NEXT: jr $ra -; O32-NEXT: st.w $w0, 0($1) +; O32-NEXT: sw $4, 0($1) ; ; N32-LABEL: insert_v2i64_vidx: ; N32: # %bb.0: @@ -2109,14 +2032,9 @@ ; N32-NEXT: lw $2, %got_disp(i32)($1) ; N32-NEXT: lw $2, 0($2) ; N32-NEXT: lw $1, %got_disp(v2i64)($1) -; N32-NEXT: ld.d $w0, 0($1) -; N32-NEXT: sll $2, $2, 3 -; N32-NEXT: sld.b $w0, $w0[$2] -; N32-NEXT: insert.d $w0[0], $4 -; N32-NEXT: neg $2, $2 -; N32-NEXT: sld.b $w0, $w0[$2] +; N32-NEXT: lsa $1, $2, $1, 3 ; N32-NEXT: jr $ra -; N32-NEXT: st.d $w0, 0($1) +; N32-NEXT: sd $4, 0($1) ; ; N64-LABEL: insert_v2i64_vidx: ; N64: # %bb.0: @@ -2126,14 +2044,9 @@ ; N64-NEXT: ld $2, %got_disp(i32)($1) ; N64-NEXT: lwu $2, 0($2) ; N64-NEXT: ld $1, %got_disp(v2i64)($1) -; N64-NEXT: ld.d $w0, 0($1) -; N64-NEXT: dsll $2, $2, 3 -; N64-NEXT: sld.b $w0, $w0[$2] -; N64-NEXT: insert.d $w0[0], $4 -; N64-NEXT: dneg $2, $2 -; N64-NEXT: sld.b $w0, $w0[$2] +; N64-NEXT: dlsa $1, $2, $1, 3 ; N64-NEXT: jr $ra -; N64-NEXT: st.d $w0, 0($1) +; N64-NEXT: sd $4, 0($1) %1 = load <2 x i64>, ptr @v2i64 %2 = load i32, ptr @i32 %3 = insertelement <2 x i64> %1, i64 %a, i32 %2 diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations_float.ll b/llvm/test/CodeGen/Mips/msa/basic_operations_float.ll --- a/llvm/test/CodeGen/Mips/msa/basic_operations_float.ll +++ b/llvm/test/CodeGen/Mips/msa/basic_operations_float.ll @@ -275,7 +275,7 @@ ; float argument passed in $f12 ; ALL-DAG: insve.w [[R1]][1], $w12[0] - store <4 x float> %2, ptr @v4f32 + store volatile <4 x float> %2, ptr @v4f32 ; ALL-DAG: st.w [[R1]] ret void @@ -291,7 +291,7 @@ ; double argument passed in $f12 ; ALL-DAG: insve.d [[R1]][1], $w12[0] - store <2 x double> %2, ptr @v2f64 + store volatile <2 x double> %2, ptr @v2f64 ; ALL-DAG: st.d [[R1]] ret void @@ -319,7 +319,7 @@ ; ALL-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]] ; ALL-DAG: sld.b [[R1]], [[R1]][[[NIDX]]] - store <4 x float> %3, ptr @v4f32 + store volatile <4 x float> %3, ptr @v4f32 ; ALL-DAG: st.w [[R1]] ret void @@ -347,7 +347,7 @@ ; ALL-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]] ; ALL-DAG: sld.b [[R1]], [[R1]][[[NIDX]]] - store <2 x double> %3, ptr @v2f64 + store volatile <2 x double> %3, ptr @v2f64 ; ALL-DAG: st.d [[R1]] ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -28,23 +28,13 @@ define void @insertelt_v4i64_store(ptr %x, i64 %y) { ; RV32-LABEL: insertelt_v4i64_store: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; RV32-NEXT: vslide1down.vx v10, v8, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a2 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vslideup.vi v8, v10, 3 -; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: sw a2, 28(a0) +; RV32-NEXT: sw a1, 24(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v4i64_store: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: vmv.s.x v10, a1 -; RV64-NEXT: vslideup.vi v8, v10, 3 -; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: sd a1, 24(a0) ; RV64-NEXT: ret %a = load <4 x i64>, ptr %x %b = insertelement <4 x i64> %a, i64 %y, i32 3 @@ -96,24 +86,13 @@ define void @insertelt_v3i64_store(ptr %x, i64 %y) { ; RV32-LABEL: 
insertelt_v3i64_store: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 3, e64, m2, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; RV32-NEXT: vslide1down.vx v10, v8, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a2 -; RV32-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV32-NEXT: vslideup.vi v8, v10, 2 -; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: sw a2, 20(a0) +; RV32-NEXT: sw a1, 16(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v3i64_store: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 3, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: vmv.s.x v10, a1 -; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, ma -; RV64-NEXT: vslideup.vi v8, v10, 2 -; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: sd a1, 16(a0) ; RV64-NEXT: ret %a = load <3 x i64>, ptr %x, align 8 %b = insertelement <3 x i64> %a, i64 %y, i32 2 @@ -135,13 +114,7 @@ define void @insertelt_v16i8_store(ptr %x, i8 %y) { ; CHECK-LABEL: insertelt_v16i8_store: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, a1 -; CHECK-NEXT: vsetivli zero, 15, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 14 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: sb a1, 14(a0) ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> %a, i8 %y, i32 14 @@ -178,30 +151,17 @@ define void @insertelt_v32i16_store(ptr %x, i16 %y, i32 %idx) { ; RV32-LABEL: insertelt_v32i16_store: ; RV32: # %bb.0: -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; RV32-NEXT: vle16.v v8, (a0) -; RV32-NEXT: vmv.s.x v12, a1 -; RV32-NEXT: addi a1, a2, 1 -; RV32-NEXT: vsetvli zero, a1, e16, m4, tu, ma -; RV32-NEXT: vslideup.vx v8, v12, a2 -; RV32-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; RV32-NEXT: vse16.v v8, (a0) +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: sh a1, 0(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v32i16_store: ; RV64: # %bb.0: -; RV64-NEXT: li a3, 32 -; RV64-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; RV64-NEXT: vle16.v v8, (a0) -; RV64-NEXT: vmv.s.x v12, a1 ; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: srli a2, a2, 32 -; RV64-NEXT: addi a1, a2, 1 -; RV64-NEXT: vsetvli zero, a1, e16, m4, tu, ma -; RV64-NEXT: vslideup.vx v8, v12, a2 -; RV64-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; RV64-NEXT: vse16.v v8, (a0) +; RV64-NEXT: srli a2, a2, 31 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: sh a1, 0(a0) ; RV64-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> %a, i16 %y, i32 %idx @@ -236,28 +196,17 @@ define void @insertelt_v8f32_store(ptr %x, float %y, i32 %idx) { ; RV32-LABEL: insertelt_v8f32_store: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vfmv.s.f v10, fa0 -; RV32-NEXT: addi a2, a1, 1 -; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, ma -; RV32-NEXT: vslideup.vx v8, v10, a1 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: fsw fa0, 0(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v8f32_store: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vfmv.s.f v10, fa0 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: addi a2, a1, 1 -; RV64-NEXT: vsetvli zero, a2, e32, m2, tu, ma -; RV64-NEXT: vslideup.vx v8, v10, a1 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vse32.v v8, (a0) +; 
RV64-NEXT: srli a1, a1, 30 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: fsw fa0, 0(a0) ; RV64-NEXT: ret %a = load <8 x float>, ptr %x %b = insertelement <8 x float> %a, float %y, i32 %idx @@ -277,15 +226,18 @@ } define void @insertelt_v8i64_0_store(ptr %x) { -; CHECK-LABEL: insertelt_v8i64_0_store: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma -; CHECK-NEXT: vmv.s.x v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: insertelt_v8i64_0_store: +; RV32: # %bb.0: +; RV32-NEXT: li a1, -1 +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: insertelt_v8i64_0_store: +; RV64: # %bb.0: +; RV64-NEXT: li a1, -1 +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: ret %a = load <8 x i64>, ptr %x %b = insertelement <8 x i64> %a, i64 -1, i32 0 store <8 x i64> %b, ptr %x @@ -321,30 +273,20 @@ define void @insertelt_v8i64_store(ptr %x, i32 %idx) { ; RV32-LABEL: insertelt_v8i64_store: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: li a2, -1 -; RV32-NEXT: vmv.s.x v12, a2 -; RV32-NEXT: addi a2, a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, ma -; RV32-NEXT: vslideup.vx v8, v12, a1 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, -1 +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v8i64_store: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vmv.s.x v12, a2 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: addi a2, a1, 1 -; RV64-NEXT: vsetvli zero, a2, e64, m4, tu, ma -; RV64-NEXT: vslideup.vx v8, v12, a1 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: srli a1, a1, 29 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, -1 +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret %a = load <8 x i64>, ptr %x %b = insertelement <8 x i64> %a, i64 -1, i32 %idx @@ -364,15 +306,18 @@ } define void @insertelt_c6_v8i64_0_store(ptr %x) { -; CHECK-LABEL: insertelt_c6_v8i64_0_store: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma -; CHECK-NEXT: vmv.s.x v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: insertelt_c6_v8i64_0_store: +; RV32: # %bb.0: +; RV32-NEXT: sw zero, 4(a0) +; RV32-NEXT: li a1, 6 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: insertelt_c6_v8i64_0_store: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 6 +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: ret %a = load <8 x i64>, ptr %x %b = insertelement <8 x i64> %a, i64 6, i32 0 store <8 x i64> %b, ptr %x @@ -408,30 +353,20 @@ define void @insertelt_c6_v8i64_store(ptr %x, i32 %idx) { ; RV32-LABEL: insertelt_c6_v8i64_store: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: li a2, 6 -; RV32-NEXT: vmv.s.x v12, a2 -; RV32-NEXT: addi a2, a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, ma -; RV32-NEXT: vslideup.vx v8, v12, a1 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sw zero, 4(a0) +; RV32-NEXT: li a1, 6 +; RV32-NEXT: sw 
a1, 0(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_c6_v8i64_store: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: li a2, 6 -; RV64-NEXT: vmv.s.x v12, a2 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: addi a2, a1, 1 -; RV64-NEXT: vsetvli zero, a2, e64, m4, tu, ma -; RV64-NEXT: vslideup.vx v8, v12, a1 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: srli a1, a1, 29 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 6 +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret %a = load <8 x i64>, ptr %x %b = insertelement <8 x i64> %a, i64 6, i32 %idx diff --git a/llvm/test/CodeGen/X86/pr47874.ll b/llvm/test/CodeGen/X86/pr47874.ll --- a/llvm/test/CodeGen/X86/pr47874.ll +++ b/llvm/test/CodeGen/X86/pr47874.ll @@ -140,17 +140,16 @@ ; SSE2-NEXT: testl %edx, %edx ; SSE2-NEXT: jle LBB2_3 ; SSE2-NEXT: ## %bb.1: ## %bb4 -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE2-NEXT: movl %edx, %eax ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: LBB2_2: ## %bb8 ; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: ## InlineAsm Start ; SSE2-NEXT: ## InlineAsm End -; SSE2-NEXT: movaps (%rdi), %xmm0 -; SSE2-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Folded Reload +; SSE2-NEXT: movss %xmm0, (%rdi) ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: decq %rax ; SSE2-NEXT: jne LBB2_2 @@ -162,17 +161,17 @@ ; AVX-NEXT: testl %edx, %edx ; AVX-NEXT: jle LBB2_3 ; AVX-NEXT: ## %bb.1: ## %bb4 -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; AVX-NEXT: movl %edx, %eax ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: LBB2_2: ## %bb8 ; AVX-NEXT: ## =>This Inner Loop Header: Depth=1 ; AVX-NEXT: ## InlineAsm Start ; AVX-NEXT: ## InlineAsm End -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; AVX-NEXT: vmovaps %xmm0, (%rdi) +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload +; AVX-NEXT: ## xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovss %xmm0, (%rdi) ; AVX-NEXT: addq $16, %rdi ; AVX-NEXT: decq %rax ; AVX-NEXT: jne LBB2_2 @@ -210,16 +209,15 @@ ; SSE2-NEXT: testq %rdx, %rdx ; SSE2-NEXT: jle LBB3_3 ; SSE2-NEXT: ## %bb.1: ## %bb3 -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: LBB3_2: ## %bb6 ; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: ## InlineAsm Start ; SSE2-NEXT: ## InlineAsm End -; SSE2-NEXT: movapd (%rdi), %xmm0 -; SSE2-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: movapd %xmm0, (%rdi) +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Folded Reload +; SSE2-NEXT: movsd %xmm0, (%rdi) ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: decq %rdx ; SSE2-NEXT: jne LBB3_2 @@ -231,16 +229,16 @@ ; AVX-NEXT: testq %rdx, %rdx ; AVX-NEXT: jle LBB3_3 ; AVX-NEXT: ## %bb.1: ## %bb3 -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: LBB3_2: ## %bb6 ; AVX-NEXT: ## =>This Inner Loop Header: Depth=1 ; AVX-NEXT: ## InlineAsm Start ; AVX-NEXT: ## InlineAsm End -; AVX-NEXT: vmovapd (%rdi), %xmm0 -; AVX-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; AVX-NEXT: vmovapd %xmm0, (%rdi) +; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload +; AVX-NEXT: ## xmm0 = mem[0],zero +; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovsd %xmm0, (%rdi) ; AVX-NEXT: addq $16, %rdi ; AVX-NEXT: decq %rdx ; AVX-NEXT: jne LBB3_2 diff --git a/llvm/test/CodeGen/X86/pr59980.ll b/llvm/test/CodeGen/X86/pr59980.ll --- a/llvm/test/CodeGen/X86/pr59980.ll +++ b/llvm/test/CodeGen/X86/pr59980.ll @@ -7,21 +7,9 @@ define void @foo(ptr %0, ptr %1, ptr %2) #0 { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: andq $-32, %rsp -; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: movl (%rdx), %eax -; CHECK-NEXT: andl $15, %eax ; CHECK-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vmovups (%rsi), %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rsp) -; CHECK-NEXT: vpextrw $0, %xmm0, (%rsp,%rax,2) -; CHECK-NEXT: vmovaps (%rsp), %ymm0 -; CHECK-NEXT: vmovups %ymm0, (%rsi) -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpextrw $0, %xmm0, (%rsi,%rax,2) ; CHECK-NEXT: retq %4 = bitcast ptr %2 to ptr %5 = load i64, ptr %4, align 8 diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll --- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll @@ -59,19 +59,17 @@ ; X86-LABEL: t3: ; X86: ## %bb.0: ; X86-NEXT: movl L_g0$non_lazy_ptr, %eax +; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: movl L_g1$non_lazy_ptr, %ecx -; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: pinsrw $0, (%eax), %xmm0 -; X86-NEXT: movq %xmm0, (%ecx) +; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: t3: ; X64: ## %bb.0: ; X64-NEXT: movq _g0@GOTPCREL(%rip), %rax +; X64-NEXT: movzwl (%rax), %eax ; X64-NEXT: movq _g1@GOTPCREL(%rip), %rcx -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pinsrw $0, (%rax), %xmm0 -; X64-NEXT: movq %xmm0, (%rcx) +; X64-NEXT: movw %ax, (%rcx) ; X64-NEXT: retq load i16, ptr @g0 load <4 x i16>, ptr @g1
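; Illustrative IR-level sketch (not part of the patch): the shape of code the
; new combine targets. The function and value names below are made up for the
; example, but they mirror the insertelt_*_store tests updated above.
define void @insert_store_sketch(ptr %p, i32 %x) {
  %vec = load <4 x i32>, ptr %p
  %ins = insertelement <4 x i32> %vec, i32 %x, i32 2
  store <4 x i32> %ins, ptr %p
  ret void
}
; With replaceStoreOfInsertLoad, the DAG for this function can be lowered as if
; the source had been a single scalar store to element 2, roughly:
;   %gep = getelementptr inbounds i32, ptr %p, i64 2
;   store i32 %x, ptr %gep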