Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -137,6 +137,12 @@
       /// Direct move from a GPR to a VSX register (zero)
       MTVSRZ,
+      /// Extract a subvector from signed integer vector and convert to FP
+      SINT_VEC_TO_VEC,
+
+      /// Extract a subvector from unsigned integer vector and convert to FP
+      UINT_VEC_TO_VEC,
+
       // FIXME: Remove these once the ANDI glue bug is fixed:
       /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
       /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
@@ -432,6 +438,15 @@
     /// DAG node.
     const char *getTargetNodeName(unsigned Opcode) const override;
+    /// LegalizeTypeAction - The code we generate when illegal vector types are
+    /// legalized by promoting the integer element type is much worse than code
+    /// we generate if we widen the type for applicable vector types.
+    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+      const override {
+      if (VT.getVectorElementType().getSizeInBits() % 8 == 0)
+        return TypeWidenVector;
+      return TargetLoweringBase::getPreferredVectorAction(VT);
+    }
     bool useSoftFloat() const override;
     MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
@@ -897,6 +912,7 @@
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -851,6 +851,7 @@
   // We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
   if (Subtarget.hasFPCVT())
     setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::LOAD);
@@ -1042,6 +1043,8 @@
   case PPCISD::MFVSR:           return "PPCISD::MFVSR";
   case PPCISD::MTVSRA:          return "PPCISD::MTVSRA";
   case PPCISD::MTVSRZ:          return "PPCISD::MTVSRZ";
+  case PPCISD::SINT_VEC_TO_VEC: return "PPCISD::SINT_VEC_TO_VEC";
+  case PPCISD::UINT_VEC_TO_VEC: return "PPCISD::UINT_VEC_TO_VEC";
   case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT";
   case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT";
   case PPCISD::VCMP:            return "PPCISD::VCMP";
@@ -10270,6 +10273,54 @@
       ShiftCst);
 }
+SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         "Should be called with a BUILD_VECTOR node");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  if (N->getValueType(0) != MVT::v2f64 || !Subtarget.hasVSX())
+    return SDValue();
+
+  // Looking for:
+  // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
+  if (N->getOperand(0).getOpcode() != ISD::SINT_TO_FP &&
+      N->getOperand(0).getOpcode() != ISD::UINT_TO_FP)
+    return SDValue();
+  if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
+      N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
+    return SDValue();
+  if (N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
+    return SDValue();
+
+  SDValue Ext1 = N->getOperand(0).getOperand(0);
+  SDValue Ext2 = N->getOperand(1).getOperand(0);
+  if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  if (Ext1.getValueType() != MVT::i32 ||
+      Ext2.getValueType() != MVT::i32 ||
+      Ext1.getOperand(0) != Ext2.getOperand(0))
+    return SDValue();
+
+  SDValue SrcVec = Ext1.getOperand(0);
+  int FirstElem = Ext1.getConstantOperandVal(1);
+  int SecondElem = Ext2.getConstantOperandVal(1);
+  int SubvecIdx;
+  if (FirstElem == 0 && SecondElem == 1)
+
SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
+  else if (FirstElem == 2 && SecondElem == 3)
+    SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
+  else
+    return SDValue();
+
+  auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
+                  PPCISD::SINT_VEC_TO_VEC : PPCISD::UINT_VEC_TO_VEC;
+  return DAG.getNode(NodeType, dl, MVT::v2f64,
+                     SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
+}
+
 SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   assert((N->getOpcode() == ISD::SINT_TO_FP ||
@@ -11106,6 +11157,8 @@
     }
     break;
   }
+  case ISD::BUILD_VECTOR:
+    return DAGCombineBuildVector(N, DCI);
   }

   return SDValue();
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -57,6 +57,9 @@
 def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
   SDTCisSameAs<0, 1>
 ]>;
+def SDTVecConv : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
+]>;
 def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
                        [SDNPHasChain, SDNPMayLoad]>;
@@ -66,6 +69,8 @@
 def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>;
 def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>;
 def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>;
+def PPCSV2V : SDNode<"PPCISD::SINT_VEC_TO_VEC", SDTVecConv, []>;
+def PPCUV2V : SDNode<"PPCISD::UINT_VEC_TO_VEC", SDTVecConv, []>;
 multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
                        string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -608,7 +613,8 @@
           "xvcvsxwdp $XT, $XB", IIC_VecFP, []>;
 def XVCVSXWSP : XX2Form<60, 184,
                       (outs vsrc:$XT), (ins vsrc:$XB),
-                      "xvcvsxwsp $XT, $XB", IIC_VecFP, []>;
+                      "xvcvsxwsp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>;
 def XVCVUXDDP : XX2Form<60, 488,
                       (outs vsrc:$XT), (ins vsrc:$XB),
                       "xvcvuxddp $XT, $XB", IIC_VecFP,
@@ -928,6 +934,16 @@
 def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
           (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
+def : 
Pat<(v2f64 (PPCSV2V v4i32:$C, 0)),
+          (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCSV2V v4i32:$C, 1)),
+          (v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>;
+
+def : Pat<(v2f64 (PPCUV2V v4i32:$C, 0)),
+          (v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCUV2V v4i32:$C, 1)),
+          (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
+
 // Loads.
 def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
Index: test/Analysis/CostModel/PowerPC/load_store.ll
===================================================================
--- test/Analysis/CostModel/PowerPC/load_store.ll
+++ test/Analysis/CostModel/PowerPC/load_store.ll
@@ -31,7 +31,7 @@
   ; FIXME: There actually are sub-vector Altivec loads, and so we could handle
   ; this with a small expense, but we don't currently.
-  ; CHECK: cost of 48 {{.*}} load
+  ; CHECK: cost of 42 {{.*}} load
   load <4 x i16>, <4 x i16>* undef, align 2
   ; CHECK: cost of 2 {{.*}} load
Index: test/CodeGen/PowerPC/load-v4i8-improved.ll
===================================================================
--- test/CodeGen/PowerPC/load-v4i8-improved.ll
+++ test/CodeGen/PowerPC/load-v4i8-improved.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \
+; RUN:   -implicit-check-not vmrg -implicit-check-not=vperm %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck \
+; RUN:   -implicit-check-not vmrg -implicit-check-not=vperm %s \
+; RUN:   --check-prefix=CHECK-BE
+
+define <16 x i8> @test(i32* %s, i32* %t) {
+entry:
+  %0 = bitcast i32* %s to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 4
+  %2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i8> %2
+; CHECK: lwz [[GPR:[0-9]+]], 0(3)
+; CHECK: mtvsrd [[VSR:[0-9]+]], [[GPR]]
+; CHECK: xxswapd [[SWP:[0-9]+]], [[VSR]]
+; CHECK: xxspltw 34, [[SWP]], 3
+; CHECK-NOT: vmrg
+; CHECK-NOT: vperm
+; CHECK-BE: lwz [[GPR:[0-9]+]], 0(3)
+; 
CHECK-BE: sldi [[SHL:[0-9]+]], [[GPR]], 32 +; CHECK-BE: mtvsrd [[VSR:[0-9]+]], [[SHL]] +; CHECK-BE: xxspltw 34, [[VSR]], 0 +; CHECK-BE-NOT: vmrg +; CHECK-BE-NOT: vperm +} Index: test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll =================================================================== --- test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll +++ test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll @@ -9,7 +9,8 @@ ret <2 x i32> %strided.vec ; CHECK-LABEL: @test1 -; CHECK: vsldoi 2, 2, 2, 12 +; CHECK: vsldoi [[TGT:[0-9]+]], 2, 2, 8 +; CHECK: vmrghw 2, 2, [[TGT]] ; CHECK: blr } Index: test/CodeGen/PowerPC/vec_cmp.ll =================================================================== --- test/CodeGen/PowerPC/vec_cmp.ll +++ test/CodeGen/PowerPC/vec_cmp.ll @@ -24,7 +24,7 @@ ret <4 x i8> %sext } ; CHECK-LABEL: v4si8_cmp: -; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone { @@ -33,7 +33,7 @@ ret <8 x i8> %sext } ; CHECK-LABEL: v8si8_cmp: -; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} ; Additional tests for v16i8 since it is a altivec native type @@ -158,7 +158,7 @@ ret <4 x i16> %sext } ; CHECK-LABEL: v4si16_cmp: -; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} ; Additional tests for v8i16 since it is an altivec native type Index: test/CodeGen/PowerPC/vsx.ll =================================================================== --- test/CodeGen/PowerPC/vsx.ll +++ test/CodeGen/PowerPC/vsx.ll @@ -1144,62 +1144,67 @@ ret <2 x double> %w ; CHECK-LABEL: @test68 -; CHECK: xxsldwi [[V1:[0-9]+]], 34, 34, 1 +; CHECK: xxmrghw [[V1:[0-9]+]] ; CHECK: xvcvsxwdp 34, [[V1]] ; CHECK: blr ; CHECK-LE-LABEL: @test68 -; CHECK-LE: xxsldwi [[V1:[0-9]+]], 34, 34, 1 +; CHECK-LE: xxmrglw [[V1:[0-9]+]], 34, 34 ; CHECK-LE: xvcvsxwdp 34, [[V1]] ; CHECK-LE: blr } 
+; This gets scalarized so the code isn't great define <2 x double> @test69(<2 x i16> %a) { %w = sitofp <2 x i16> %a to <2 x double> ret <2 x double> %w ; CHECK-LABEL: @test69 -; CHECK: vspltisw [[V1:[0-9]+]], 8 -; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]] -; CHECK: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]] -; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]] -; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1 -; CHECK: xvcvsxwdp 34, [[V4]] +; CHECK-DAG: lfiwax +; CHECK-DAG: lfiwax +; CHECK-DAG: xscvsxddp +; CHECK-DAG: xscvsxddp +; CHECK: xxmrghd ; CHECK: blr ; CHECK-LE-LABEL: @test69 -; CHECK-LE: vspltisw [[V1:[0-9]+]], 8 -; CHECK-LE: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]] -; CHECK-LE: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]] -; CHECK-LE: vsraw {{[0-9]+}}, [[V3]], [[V2]] -; CHECK-LE: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1 -; CHECK-LE: xvcvsxwdp 34, [[V4]] +; CHECK-LE: mfvsrd +; CHECK-LE: mtvsrwa +; CHECK-LE: mtvsrwa +; CHECK-LE: xscvsxddp +; CHECK-LE: xscvsxddp +; CHECK-LE: xxspltd +; CHECK-LE: xxspltd +; CHECK-LE: xxmrgld ; CHECK-LE: blr } +; This gets scalarized so the code isn't great define <2 x double> @test70(<2 x i8> %a) { %w = sitofp <2 x i8> %a to <2 x double> ret <2 x double> %w ; CHECK-LABEL: @test70 -; CHECK: vspltisw [[V1:[0-9]+]], 12 -; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]] -; CHECK: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]] -; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]] -; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1 -; CHECK: xvcvsxwdp 34, [[V4]] +; CHECK-DAG: lfiwax +; CHECK-DAG: lfiwax +; CHECK-DAG: xscvsxddp +; CHECK-DAG: xscvsxddp +; CHECK: xxmrghd ; CHECK: blr ; CHECK-LE-LABEL: @test70 -; CHECK-LE: vspltisw [[V1:[0-9]+]], 12 -; CHECK-LE: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]] -; CHECK-LE: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]] -; CHECK-LE: vsraw {{[0-9]+}}, [[V3]], [[V2]] -; CHECK-LE: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1 -; CHECK-LE: xvcvsxwdp 34, [[V4]] +; CHECK-LE: mfvsrd +; CHECK-LE: mtvsrwa +; CHECK-LE: mtvsrwa +; 
CHECK-LE: xscvsxddp +; CHECK-LE: xscvsxddp +; CHECK-LE: xxspltd +; CHECK-LE: xxspltd +; CHECK-LE: xxmrgld ; CHECK-LE: blr } +; This gets scalarized so the code isn't great define <2 x i32> @test80(i32 %v) { %b1 = insertelement <2 x i32> undef, i32 %v, i32 0 %b2 = shufflevector <2 x i32> %b1, <2 x i32> undef, <2 x i32> zeroinitializer @@ -1207,31 +1212,38 @@ ret <2 x i32> %i ; CHECK-REG-LABEL: @test80 -; CHECK-REG-DAG: addi [[R1:[0-9]+]], 3, 3 -; CHECK-REG-DAG: addi [[R2:[0-9]+]], 1, -16 -; CHECK-REG-DAG: addi [[R3:[0-9]+]], 3, 2 -; CHECK-REG: std [[R1]], -8(1) -; CHECK-REG: std [[R3]], -16(1) -; CHECK-REG: lxvd2x 34, 0, [[R2]] -; CHECK-REG-NOT: stxvd2x +; CHECK-REG: stw 3, -16(1) +; CHECK-REG: addi [[R1:[0-9]+]], 1, -16 +; CHECK-REG: addis [[R2:[0-9]+]] +; CHECK-REG: addi [[R2]], [[R2]] +; CHECK-REG-DAG: lxvw4x [[VS1:[0-9]+]], 0, [[R1]] +; CHECK-REG-DAG: lxvw4x 35, 0, [[R2]] +; CHECK-REG: xxspltw 34, [[VS1]], 0 +; CHECK-REG: vadduwm 2, 2, 3 +; CHECK-REG-NOT: stxvw4x ; CHECK-REG: blr ; CHECK-FISL-LABEL: @test80 -; CHECK-FISL-DAG: addi [[R1:[0-9]+]], 3, 3 -; CHECK-FISL-DAG: addi [[R2:[0-9]+]], 1, -16 -; CHECK-FISL-DAG: addi [[R3:[0-9]+]], 3, 2 -; CHECK-FISL-DAG: std [[R1]], -8(1) -; CHECK-FISL-DAG: std [[R3]], -16(1) -; CHECK-FISL-DAG: lxvd2x 0, 0, [[R2]] +; CHECK-FISL: mr 4, 3 +; CHECK-FISL: stw 4, -16(1) +; CHECK-FISL: addi [[R1:[0-9]+]], 1, -16 +; CHECK-FISL-DAG: lxvw4x [[VS1:[0-9]+]], 0, [[R1]] +; CHECK-FISL-DAG: xxspltw {{[0-9]+}}, [[VS1]], 0 +; CHECK-FISL: addis [[R2:[0-9]+]] +; CHECK-FISL: addi [[R2]], [[R2]] +; CHECK-FISL-DAG: lxvw4x {{[0-9]+}}, 0, [[R2]] +; CHECK-FISL: vadduwm +; CHECK-FISL-NOT: stxvw4x ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test80 ; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3 +; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]] ; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI ; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]] -; CHECK-LE-DAG: xxspltd 34, [[R1]] +; CHECK-LE-DAG: xxspltw 34, [[V1]] ; CHECK-LE-DAG: xxswapd 35, [[V2]] -; CHECK-LE: vaddudm 
2, 2, 3 +; CHECK-LE: vadduwm 2, 2, 3 ; CHECK-LE: blr }