Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -587,6 +587,7 @@
   SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSIGN_EXTEND_INREG(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineLOAD(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -523,6 +523,7 @@
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+  setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::FP_ROUND);
@@ -5368,6 +5369,51 @@
   return SDValue();
 }
 
+// Combine a scalar floating-point load that feeds a REPLICATE and also has
+// other users of the loaded value. Those other users are rewritten to read
+// element 0 of the REPLICATE instead of the load's value result. Returns
+// SDValue(N, 0) if users were rewritten, or an empty SDValue if the pattern
+// does not apply.
+SDValue SystemZTargetLowering::combineLOAD(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT LdVT = N->getValueType(0);
+  if (LdVT.isVector() || LdVT.isInteger())
+    return SDValue();
+  // Transform a scalar load that is REPLICATEd as well as having other
+  // use(s) to the form where the other use(s) use the first element of the
+  // REPLICATE instead of the load. Otherwise instruction selection will not
+  // produce a VLREP. Avoid extracting to a GPR, so only do this for floating
+  // point loads.
+
+  SDValue Replicate;
+  SmallVector<SDNode *, 8> OtherUses;
+  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() == SystemZISD::REPLICATE) {
+      if (Replicate)
+        return SDValue(); // Should never happen
+      Replicate = SDValue(*UI, 0);
+    }
+    else if (UI.getUse().getResNo() == 0)
+      OtherUses.push_back(*UI);
+  }
+  if (!Replicate || OtherUses.empty())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue Extract0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LdVT,
+                                 Replicate, DAG.getConstant(0, DL, MVT::i32));
+  // Update uses of the loaded Value while preserving old chains.
+  for (SDNode *U : OtherUses) {
+    SmallVector<SDValue, 8> Ops;
+    for (SDValue Op : U->ops())
+      Ops.push_back((Op.getNode() == N && Op.getResNo() == 0) ? Extract0 : Op);
+    DAG.UpdateNodeOperands(U, Ops);
+  }
+  return SDValue(N, 0);
+}
+
 SDValue SystemZTargetLowering::combineSTORE(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -5699,6 +5745,7 @@
   case ISD::SIGN_EXTEND_INREG:  return combineSIGN_EXTEND_INREG(N, DCI);
   case SystemZISD::MERGE_HIGH:
   case SystemZISD::MERGE_LOW:   return combineMERGE(N, DCI);
+  case ISD::LOAD:               return combineLOAD(N, DCI);
   case ISD::STORE:              return combineSTORE(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI);
   case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
Index: test/CodeGen/SystemZ/vec-move-21.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/vec-move-21.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that a replicate of a load gets folded to vlrep also in cases where
+; the load has multiple users.
+
+; CHECK-NOT: vrep
+
+
+define double @fun(double* %Vsrc, <2 x double> %T) {
+entry:
+  %Vgep1 = getelementptr double, double* %Vsrc, i64 0
+  %Vld1 = load double, double* %Vgep1
+  %Vgep2 = getelementptr double, double* %Vsrc, i64 1
+  %Vld2 = load double, double* %Vgep2
+  %Vgep3 = getelementptr double, double* %Vsrc, i64 2
+  %Vld3 = load double, double* %Vgep3
+  %Vgep4 = getelementptr double, double* %Vsrc, i64 3
+  %Vld4 = load double, double* %Vgep4
+  %Vgep5 = getelementptr double, double* %Vsrc, i64 4
+  %Vld5 = load double, double* %Vgep5
+  %Vgep6 = getelementptr double, double* %Vsrc, i64 5
+  %Vld6 = load double, double* %Vgep6
+
+  %V19 = insertelement <2 x double> undef, double %Vld1, i32 0
+  %V20 = shufflevector <2 x double> %V19, <2 x double> undef, <2 x i32> zeroinitializer
+  %V21 = insertelement <2 x double> undef, double %Vld4, i32 0
+  %V22 = insertelement <2 x double> %V21, double %Vld5, i32 1
+  %V23 = fmul <2 x double> %V20, %V22
+  %V24 = fadd <2 x double> %T, %V23
+  %V25 = insertelement <2 x double> %V19, double %Vld2, i32 1
+  %V26 = insertelement <2 x double> undef, double %Vld6, i32 0
+  %V27 = insertelement <2 x double> %V26, double %Vld6, i32 1
+  %V28 = fmul <2 x double> %V25, %V27
+  %V29 = fadd <2 x double> %T, %V28
+  %V30 = insertelement <2 x double> undef, double %Vld2, i32 0
+  %V31 = shufflevector <2 x double> %V30, <2 x double> undef, <2 x i32> zeroinitializer
+  %V32 = insertelement <2 x double> undef, double %Vld5, i32 0
+  %V33 = insertelement <2 x double> %V32, double %Vld6, i32 1
+  %V34 = fmul <2 x double> %V31, %V33
+  %V35 = fadd <2 x double> %T, %V34
+  %V36 = insertelement <2 x double> undef, double %Vld3, i32 0
+  %V37 = shufflevector <2 x double> %V36, <2 x double> undef, <2 x i32> zeroinitializer
+  %V38 = fmul <2 x double> %V37, %V22
+  %V39 = fadd <2 x double> %T, %V38
+  %Vmul37 = fmul double %Vld3, %Vld6
+  %Vadd38 = fadd double %Vmul37, %Vmul37
+
+  %VA0 = fadd <2 x double> %V24, %V29
+  %VA1 = fadd <2 x double> %VA0, %V35
+  %VA2 = fadd <2 x double> %VA1, %V39
+
+  %VE0 = extractelement <2 x double> %VA2, i32 0
+  %VS1 = fadd double %VE0, %Vadd38
+
+  ret double %VS1
+}
Index: test/CodeGen/SystemZ/vec-move-22.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/vec-move-22.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that a loaded value which is used both in a vector and scalar context
+; is not transformed to a vlrep + vlgvg.
+
+; CHECK-NOT: vlrep
+
+define void @fun(i64 %arg, i64** %Addr, <2 x i64*>* %Dst) {
+  %tmp10 = load i64*, i64** %Addr
+  store i64 %arg, i64* %tmp10
+  %tmp12 = insertelement <2 x i64*> undef, i64* %tmp10, i32 0
+  %tmp13 = insertelement <2 x i64*> %tmp12, i64* %tmp10, i32 1
+  store <2 x i64*> %tmp13, <2 x i64*>* %Dst
+  ret void
+}