Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4490,18 +4490,21 @@
       // avoid a false dependency on any previous contents of the vector
       // register.
 
-      // Use a VLREP if at least one element is a load.
-      unsigned LoadElIdx = UINT_MAX;
+      // Use a VLREP if at least one element is a load. Make sure to replicate
+      // the load with the most elements having its value.
+      std::map<const SDNode*, unsigned> UseCounts;
+      SDNode *LoadMaxUses = nullptr;
       for (unsigned I = 0; I < NumElements; ++I)
         if (Elems[I].getOpcode() == ISD::LOAD &&
             cast<LoadSDNode>(Elems[I])->isUnindexed()) {
-          LoadElIdx = I;
-          break;
+          SDNode *Ld = Elems[I].getNode();
+          UseCounts[Ld]++;
+          if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
+            LoadMaxUses = Ld;
         }
-      if (LoadElIdx != UINT_MAX) {
-        Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, Elems[LoadElIdx]);
-        Done[LoadElIdx] = true;
-        ReplicatedVal = Elems[LoadElIdx];
+      if (LoadMaxUses != nullptr) {
+        ReplicatedVal = SDValue(LoadMaxUses, 0);
+        Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, ReplicatedVal);
       } else {
         // Try to use VLVGP.
         unsigned I1 = NumElements / 2 - 1;
Index: test/CodeGen/SystemZ/vec-move-20.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/vec-move-20.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that a vector which is built with elements from two loads replicates
+; the load with the most elements having its value.
+
+; CHECK: vlef
+; CHECK-NOT: vlvgf
+
+define void @update(i32* %src1, i32* %src2, <4 x i32>* %dst) {
+bb:
+  %tmp = load i32, i32* %src1
+  %tmp1 = load i32, i32* %src2
+  %tmp2 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+  %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+  %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+  %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+  store <4 x i32> %tmp5, <4 x i32>* %dst
+  ret void
+}