Index: lib/Target/PowerPC/CMakeLists.txt =================================================================== --- lib/Target/PowerPC/CMakeLists.txt +++ lib/Target/PowerPC/CMakeLists.txt @@ -36,6 +36,7 @@ PPCTLSDynamicCall.cpp PPCVSXCopy.cpp PPCVSXFMAMutate.cpp + PPCVSXSwapRemoval.cpp ) add_subdirectory(AsmParser) Index: lib/Target/PowerPC/PPC.h =================================================================== --- lib/Target/PowerPC/PPC.h +++ lib/Target/PowerPC/PPC.h @@ -39,6 +39,7 @@ FunctionPass *createPPCEarlyReturnPass(); FunctionPass *createPPCVSXCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); + FunctionPass *createPPCVSXSwapRemovalPass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCTLSDynamicCallPass(); Index: lib/Target/PowerPC/PPC.td =================================================================== --- lib/Target/PowerPC/PPC.td +++ lib/Target/PowerPC/PPC.td @@ -192,6 +192,14 @@ bit IsVSXFMAAlt = 0; } +// LaneSensitive - Filter class used to identify instructions that are +// sensitive to which lanes contain which values. That is, they are not +// "pure SIMD" instructions where the lanes are fully independent. This +// is used by the VSX swap removal optimization. +class LaneSensitive { + bit SwapFlag = 1; +} + //===----------------------------------------------------------------------===// // Relation Map Definitions. //===----------------------------------------------------------------------===// @@ -235,6 +243,23 @@ let ValueCols = [["1"]]; } +// This is a slight misuse of InstrMapping because we don't +// really have a better way to flag instructions as belonging to +// a set. This is a simple mapping of instructions to themselves, +// to be applied to those instructions that are lane-sensitive +// (see the definition of LaneSensitive, above). +def opcodeIsLaneSensitive : InstrMapping { + let FilterClass = "LaneSensitive"; + // Instructions with the same BaseName values form a row. + let RowFields = ["BaseName"]; + // Instructions with the same SwapFlag values form a column. + let ColFields = ["SwapFlag"]; + // The key column is the SwapFlag. + let KeyCol = ["1"]; + // The value column is also the SwapFlag. + let ValueCols = [["1"]]; +} + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// Index: lib/Target/PowerPC/PPCInstrAltivec.td =================================================================== --- lib/Target/PowerPC/PPCInstrAltivec.td +++ lib/Target/PowerPC/PPCInstrAltivec.td @@ -353,48 +353,66 @@ [(int_ppc_altivec_mtvscr v4i32:$vB)]>; let PPC970_Unit = 2 in { // Loads. 
+let BaseName = "LVEBX" in def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src), "lvebx $vD, $src", IIC_LdStLoad, - [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>; + [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>, + LaneSensitive; +let BaseName = "LVEHX" in def LVEHX: XForm_1<31, 39, (outs vrrc:$vD), (ins memrr:$src), "lvehx $vD, $src", IIC_LdStLoad, - [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>; + [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>, + LaneSensitive; +let BaseName = "LVEWX" in def LVEWX: XForm_1<31, 71, (outs vrrc:$vD), (ins memrr:$src), "lvewx $vD, $src", IIC_LdStLoad, - [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>; + [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>, + LaneSensitive; def LVX : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src), "lvx $vD, $src", IIC_LdStLoad, [(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>; +let BaseName = "LVXL" in def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src), "lvxl $vD, $src", IIC_LdStLoad, - [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>; + [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>, + LaneSensitive; } +let BaseName = "LVSL" in def LVSL : XForm_1<31, 6, (outs vrrc:$vD), (ins memrr:$src), "lvsl $vD, $src", IIC_LdStLoad, [(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>, - PPC970_Unit_LSU; + PPC970_Unit_LSU, LaneSensitive; +let BaseName = "LVSR" in def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src), "lvsr $vD, $src", IIC_LdStLoad, [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>, - PPC970_Unit_LSU; + PPC970_Unit_LSU, LaneSensitive; let PPC970_Unit = 2 in { // Stores. +let BaseName = "STVEBX" in def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst), "stvebx $rS, $dst", IIC_LdStStore, - [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>; + [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>, + LaneSensitive; +let BaseName = "STVEHX" in def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst), "stvehx $rS, $dst", IIC_LdStStore, - [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>; + [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>, + LaneSensitive; +let BaseName = "STVEWX" in def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst), "stvewx $rS, $dst", IIC_LdStStore, - [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>; + [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>, + LaneSensitive; def STVX : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst), "stvx $rS, $dst", IIC_LdStStore, [(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>; +let BaseName = "STVXL" in def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst), "stvxl $rS, $dst", IIC_LdStStore, - [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>; + [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>, + LaneSensitive; } let PPC970_Unit = 5 in { // VALU Operations. @@ -417,15 +435,18 @@ def VMLADDUHM : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>; } // isCommutable +let BaseName = "VPERM" in def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm, - v4i32, v4i32, v16i8>; + v4i32, v4i32, v16i8>, LaneSensitive; def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>; // Shuffles. +let BaseName = "VSLDOI" in def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH), "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP, [(set v16i8:$vD, - (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>; + (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; // VX-Form instructions. AltiVec arithmetic ops. 
let isCommutable = 1 in { @@ -526,24 +547,42 @@ def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>; } // isCommutable +let BaseName = "VMRGHB" in def VMRGHB : VXForm_1< 12, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmrghb $vD, $vA, $vB", IIC_VecFP, - [(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>; + [(set v16i8:$vD, + (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; +let BaseName = "VMRGHH" in def VMRGHH : VXForm_1< 76, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmrghh $vD, $vA, $vB", IIC_VecFP, - [(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>; + [(set v16i8:$vD, + (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; +let BaseName = "VMRGHW" in def VMRGHW : VXForm_1<140, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmrghw $vD, $vA, $vB", IIC_VecFP, - [(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>; + [(set v16i8:$vD, + (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; +let BaseName = "VMRGLB" in def VMRGLB : VXForm_1<268, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmrglb $vD, $vA, $vB", IIC_VecFP, - [(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>; + [(set v16i8:$vD, + (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; +let BaseName = "VMRGLH" in def VMRGLH : VXForm_1<332, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmrglh $vD, $vA, $vB", IIC_VecFP, - [(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>; + [(set v16i8:$vD, + (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; +let BaseName = "VMRGLW" in def VMRGLW : VXForm_1<396, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmrglw $vD, $vA, $vB", IIC_VecFP, - [(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>; + [(set v16i8:$vD, + (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm, v4i32, v16i8, v4i32>; @@ -559,22 +598,30 @@ v4i32, v8i16, v4i32>; let isCommutable = 1 in { +let BaseName = "VMULESB" in def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb, - v8i16, v16i8>; + v8i16, v16i8>, LaneSensitive; +let BaseName = "VMULESH" in def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh, - v4i32, v8i16>; + v4i32, v8i16>, LaneSensitive; +let BaseName = "VMULEUB" in def VMULEUB : VX1_Int_Ty2<520, "vmuleub", int_ppc_altivec_vmuleub, - v8i16, v16i8>; + v8i16, v16i8>, LaneSensitive; +let BaseName = "VMULEUH" in def VMULEUH : VX1_Int_Ty2<584, "vmuleuh", int_ppc_altivec_vmuleuh, - v4i32, v8i16>; + v4i32, v8i16>, LaneSensitive; +let BaseName = "VMULOSB" in def VMULOSB : VX1_Int_Ty2<264, "vmulosb", int_ppc_altivec_vmulosb, - v8i16, v16i8>; + v8i16, v16i8>, LaneSensitive; +let BaseName = "VMULOSH" in def VMULOSH : VX1_Int_Ty2<328, "vmulosh", int_ppc_altivec_vmulosh, - v4i32, v8i16>; + v4i32, v8i16>, LaneSensitive; +let BaseName = "VMULOUB" in def VMULOUB : VX1_Int_Ty2< 8, "vmuloub", int_ppc_altivec_vmuloub, - v8i16, v16i8>; + v8i16, v16i8>, LaneSensitive; +let BaseName = "VMULOUH" in def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh, - v4i32, v8i16>; + v4i32, v8i16>, LaneSensitive; } // isCommutable def VREFP : VX2_Int_SP<266, "vrefp", int_ppc_altivec_vrefp>; @@ -606,15 +653,22 @@ def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>; def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>; -def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>; -def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>; +let BaseName = "VSUMSWS" in +def VSUMSWS :
VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>, + LaneSensitive; +let BaseName = "VSUM2SWS" in +def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>, + LaneSensitive; +let BaseName = "VSUM4SBS" in def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs, - v4i32, v16i8, v4i32>; + v4i32, v16i8, v4i32>, LaneSensitive; +let BaseName = "VSUM4SHS" in def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs, - v4i32, v8i16, v4i32>; + v4i32, v8i16, v4i32>, LaneSensitive; +let BaseName = "VSUM4UBS" in def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs, - v4i32, v16i8, v4i32>; + v4i32, v16i8, v4i32>, LaneSensitive; def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vnor $vD, $vA, $vB", IIC_VecFP, @@ -629,12 +683,22 @@ [(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>; } // isCommutable -def VRLB : VX1_Int_Ty< 4, "vrlb", int_ppc_altivec_vrlb, v16i8>; -def VRLH : VX1_Int_Ty< 68, "vrlh", int_ppc_altivec_vrlh, v8i16>; -def VRLW : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>; - -def VSL : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl, v4i32 >; -def VSLO : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>; +let BaseName = "VRLB" in +def VRLB : VX1_Int_Ty< 4, "vrlb", int_ppc_altivec_vrlb, v16i8>, + LaneSensitive; +let BaseName = "VRLH" in +def VRLH : VX1_Int_Ty< 68, "vrlh", int_ppc_altivec_vrlh, v8i16>, + LaneSensitive; +let BaseName = "VRLW" in +def VRLW : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>, + LaneSensitive; + +let BaseName = "VSL" in +def VSL : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl, v4i32>, + LaneSensitive; +let BaseName = "VSLO" in +def VSLO : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>, + LaneSensitive; def VSLB : VX1_Int_Ty< 260, "vslb", int_ppc_altivec_vslb, v16i8>; def VSLH : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>; @@ -653,8 +717,12 @@ [(set v16i8:$vD, (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>; -def VSR : VX1_Int_Ty< 708, "vsr" , int_ppc_altivec_vsr, v4i32>; -def VSRO : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>; +let BaseName = "VSR" in +def VSR : VX1_Int_Ty< 708, "vsr" , int_ppc_altivec_vsr, v4i32>, + LaneSensitive; +let BaseName = "VSRO" in +def VSRO : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>, + LaneSensitive; def VSRAB : VX1_Int_Ty< 772, "vsrab", int_ppc_altivec_vsrab, v16i8>; def VSRAH : VX1_Int_Ty< 836, "vsrah", int_ppc_altivec_vsrah, v8i16>; @@ -675,42 +743,59 @@ [(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>; // Vector Pack. 
+let BaseName = "VPKPX" in def VPKPX : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx, - v8i16, v4i32>; + v8i16, v4i32>, LaneSensitive; +let BaseName = "VPKSHSS" in def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss, - v16i8, v8i16>; + v16i8, v8i16>, LaneSensitive; +let BaseName = "VPKSHUS" in def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus, - v16i8, v8i16>; + v16i8, v8i16>, LaneSensitive; +let BaseName = "VPKSWSS" in def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss, - v16i8, v4i32>; + v16i8, v4i32>, LaneSensitive; +let BaseName = "VPKSWUS" in def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus, - v8i16, v4i32>; + v8i16, v4i32>, LaneSensitive; +let BaseName = "VPKUHUM" in def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vpkuhum $vD, $vA, $vB", IIC_VecFP, [(set v16i8:$vD, - (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>; + (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; +let BaseName = "VPKUHUS" in def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus, - v16i8, v8i16>; + v16i8, v8i16>, LaneSensitive; +let BaseName = "VPKUWUM" in def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vpkuwum $vD, $vA, $vB", IIC_VecFP, [(set v16i8:$vD, - (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>; + (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>, + LaneSensitive; +let BaseName = "VPKUWUS" in def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus, - v8i16, v4i32>; + v8i16, v4i32>, LaneSensitive; // Vector Unpack. +let BaseName = "VUPKHPX" in def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx, - v4i32, v8i16>; + v4i32, v8i16>, LaneSensitive; +let BaseName = "VUPKHSB" in def VUPKHSB : VX2_Int_Ty2<526, "vupkhsb", int_ppc_altivec_vupkhsb, - v8i16, v16i8>; + v8i16, v16i8>, LaneSensitive; +let BaseName = "VUPKHSH" in def VUPKHSH : VX2_Int_Ty2<590, "vupkhsh", int_ppc_altivec_vupkhsh, - v4i32, v8i16>; + v4i32, v8i16>, LaneSensitive; +let BaseName = "VUPKLPX" in def VUPKLPX : VX2_Int_Ty2<974, "vupklpx", int_ppc_altivec_vupklpx, - v4i32, v8i16>; + v4i32, v8i16>, LaneSensitive; +let BaseName = "VUPKLSB" in def VUPKLSB : VX2_Int_Ty2<654, "vupklsb", int_ppc_altivec_vupklsb, - v8i16, v16i8>; + v8i16, v16i8>, LaneSensitive; +let BaseName = "VUPKLSH" in def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh, - v4i32, v8i16>; + v4i32, v8i16>, LaneSensitive; // Altivec Comparisons. 
@@ -953,14 +1038,18 @@ let Predicates = [HasP8Altivec] in { let isCommutable = 1 in { +let BaseName = "VMULESW" in def VMULESW : VX1_Int_Ty2<904, "vmulesw", int_ppc_altivec_vmulesw, - v2i64, v4i32>; + v2i64, v4i32>, LaneSensitive; +let BaseName = "VMULEUW" in def VMULEUW : VX1_Int_Ty2<648, "vmuleuw", int_ppc_altivec_vmuleuw, - v2i64, v4i32>; + v2i64, v4i32>, LaneSensitive; +let BaseName = "VMULOSW" in def VMULOSW : VX1_Int_Ty2<392, "vmulosw", int_ppc_altivec_vmulosw, - v2i64, v4i32>; + v2i64, v4i32>, LaneSensitive; +let BaseName = "VMULOUW" in def VMULOUW : VX1_Int_Ty2<136, "vmulouw", int_ppc_altivec_vmulouw, - v2i64, v4i32>; + v2i64, v4i32>, LaneSensitive; def VMULUWM : VXForm_1<137, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmuluwm $vD, $vA, $vB", IIC_VecGeneral, [(set v4i32:$vD, (mul v4i32:$vA, v4i32:$vB))]>; @@ -971,7 +1060,9 @@ } // isCommutable // Vector shifts -def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>; +let BaseName = "VRLD" in +def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>, + LaneSensitive; def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vsld $vD, $vA, $vB", IIC_VecGeneral, [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>; @@ -1054,32 +1145,55 @@ def VCMPGTUDo : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>; // The cryptography instructions that do not require Category:Vector.Crypto +let BaseName = "VPMSUMB" in def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb", - int_ppc_altivec_crypto_vpmsumb, v16i8>; + int_ppc_altivec_crypto_vpmsumb, v16i8>, + LaneSensitive; +let BaseName = "VPMSUMH" in def VPMSUMH : VX1_Int_Ty<1096, "vpmsumh", - int_ppc_altivec_crypto_vpmsumh, v8i16>; + int_ppc_altivec_crypto_vpmsumh, v8i16>, + LaneSensitive; +let BaseName = "VPMSUMW" in def VPMSUMW : VX1_Int_Ty<1160, "vpmsumw", - int_ppc_altivec_crypto_vpmsumw, v4i32>; + int_ppc_altivec_crypto_vpmsumw, v4i32>, + LaneSensitive; +let BaseName = "VPMSUMD" in def VPMSUMD : VX1_Int_Ty<1224, "vpmsumd", - int_ppc_altivec_crypto_vpmsumd, v2i64>; + int_ppc_altivec_crypto_vpmsumd, v2i64>, + LaneSensitive; +let BaseName = "VPERMXOR" in def VPERMXOR : VA1a_Int_Ty<45, "vpermxor", - int_ppc_altivec_crypto_vpermxor, v16i8>; + int_ppc_altivec_crypto_vpermxor, v16i8>, + LaneSensitive; } // end HasP8Altivec // Crypto instructions (from builtins) let Predicates = [HasP8Crypto] in { +let BaseName = "VSHASIGMAW" in def VSHASIGMAW : VXCR_Int_Ty<1666, "vshasigmaw", - int_ppc_altivec_crypto_vshasigmaw, v4i32>; + int_ppc_altivec_crypto_vshasigmaw, v4i32>, + LaneSensitive; +let BaseName = "VSHASIGMAD" in def VSHASIGMAD : VXCR_Int_Ty<1730, "vshasigmad", - int_ppc_altivec_crypto_vshasigmad, v2i64>; + int_ppc_altivec_crypto_vshasigmad, v2i64>, + LaneSensitive; +let BaseName = "VCIPHER" in def VCIPHER : VX1_Int_Ty<1288, "vcipher", int_ppc_altivec_crypto_vcipher, - v2i64>; + v2i64>, LaneSensitive; +let BaseName = "VCIPHERLAST" in def VCIPHERLAST : VX1_Int_Ty<1289, "vcipherlast", - int_ppc_altivec_crypto_vcipherlast, v2i64>; + int_ppc_altivec_crypto_vcipherlast, v2i64>, + LaneSensitive; +let BaseName = "VNCIPHER" in def VNCIPHER : VX1_Int_Ty<1352, "vncipher", - int_ppc_altivec_crypto_vncipher, v2i64>; + int_ppc_altivec_crypto_vncipher, v2i64>, + LaneSensitive; +let BaseName = "VNCIPHERLAST" in def VNCIPHERLAST : VX1_Int_Ty<1353, "vncipherlast", - int_ppc_altivec_crypto_vncipherlast, v2i64>; -def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>; + int_ppc_altivec_crypto_vncipherlast, v2i64>, + LaneSensitive; +let BaseName = "VSBOX" in +def VSBOX : VXBX_Int_Ty<1480, 
"vsbox", int_ppc_altivec_crypto_vsbox, v2i64>, + LaneSensitive; } // HasP8Crypto Index: lib/Target/PowerPC/PPCInstrVSX.td =================================================================== --- lib/Target/PowerPC/PPCInstrVSX.td +++ lib/Target/PowerPC/PPCInstrVSX.td @@ -80,9 +80,11 @@ "lxvd2x $XT, $src", IIC_LdStLFD, [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>; + let BaseName = "LXVDSX" in def LXVDSX : XX1Form<31, 332, (outs vsrc:$XT), (ins memrr:$src), - "lxvdsx $XT, $src", IIC_LdStLFD, []>; + "lxvdsx $XT, $src", IIC_LdStLFD, []>, + LaneSensitive; def LXVW4X : XX1Form<31, 780, (outs vsrc:$XT), (ins memrr:$src), @@ -92,10 +94,12 @@ // Store indexed instructions let mayStore = 1 in { + let BaseName = "STXSDX" in def STXSDX : XX1Form<31, 716, (outs), (ins vsfrc:$XT, memrr:$dst), "stxsdx $XT, $dst", IIC_LdStSTFD, - [(store f64:$XT, xoaddr:$dst)]>; + [(store f64:$XT, xoaddr:$dst)]>, + LaneSensitive; def STXVD2X : XX1Form<31, 972, (outs), (ins vsrc:$XT, memrr:$dst), @@ -731,12 +735,16 @@ } // isCommutable // Permutation Instructions + let BaseName = "XXMRGHW" in def XXMRGHW : XX3Form<60, 18, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>; + "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>, + LaneSensitive; + let BaseName = "XXMRGLW" in def XXMRGLW : XX3Form<60, 50, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>; + "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>, + LaneSensitive; def XXPERMDI : XX3Form_2<60, 10, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM), @@ -748,9 +756,11 @@ def XXSLDWI : XX3Form_2<60, 2, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW), "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm, []>; + let BaseName = "XXSPLTW" in def XXSPLTW : XX2Form_2<60, 164, (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM), - "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>; + "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>, + LaneSensitive; } // hasSideEffects // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -38,6 +38,10 @@ VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early", cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early")); +static cl:: +opt DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, + cl::desc("Disable VSX Swap Removal for PPC")); + static cl::opt EnableGEPOpt("ppc-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), @@ -239,6 +243,7 @@ bool addPreISel() override; bool addILPOpts() override; bool addInstSelector() override; + void addMachineSSAOptimization() override; void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; @@ -306,6 +311,15 @@ return false; } +void PPCPassConfig::addMachineSSAOptimization() { + TargetPassConfig::addMachineSSAOptimization(); + // For little endian, remove where possible the vector swap instructions + // introduced at code generation to normalize vector element order. + if (Triple(TM->getTargetTriple()).getArch() == Triple::ppc64le && + !DisableVSXSwapRemoval) + addPass(createPPCVSXSwapRemovalPass()); +} + void PPCPassConfig::addPreRegAlloc() { initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry()); insertPass(VSXFMAMutateEarly ? 
&RegisterCoalescerID : &MachineSchedulerID, Index: lib/Target/PowerPC/PPCVSXSwapRemoval.cpp =================================================================== --- lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -0,0 +1,708 @@ +//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This pass analyzes vector computations and removes unnecessary +// doubleword swaps (xxswapd instructions). This pass is performed +// only for little-endian VSX code generation. +// +// For this specific case, loads and stores of v4i32, v4f32, v2i64, +// and v2f64 vectors are inefficient. These are implemented using +// the lxvd2x and stxvd2x instructions, which invert the order of +// doublewords in a vector register. Thus code generation inserts +// an xxswapd after each such load, and prior to each such store. +// +// The extra xxswapd instructions reduce performance. The purpose +// of this pass is to reduce the number of xxswapd instructions +// required for correctness. +// +// The primary insight is that much code that operates on vectors +// does not care about the relative order of elements in a register, +// so long as the correct memory order is preserved. If we have a +// computation where all input values are provided by lxvd2x/xxswapd, +// all outputs are stored using xxswapd/stxvd2x, and all intermediate +// computations are lane-insensitive (independent of element order), +// then all the xxswapd instructions associated with the loads and +// stores may be removed without changing observable semantics. +// +// This pass uses standard equivalence class infrastructure to create +// maximal webs of computations fitting the above description. Each +// such web is then optimized by removing its unnecessary xxswapd +// instructions. +// +// There are some lane-sensitive operations for which we can still +// permit the optimization, provided we modify those operations +// accordingly. Such operations are identified as using "special +// handling" within this module. +// +//===---------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-vsx-swaps" + +namespace llvm { + void initializePPCVSXSwapRemovalPass(PassRegistry&); + namespace PPC { + int opcodeIsLaneSensitive(uint16_t Opcode); + } +} + +namespace { + +// A PPCVSXSwapEntry is created for each machine instruction that +// is relevant to a vector computation. +struct PPCVSXSwapEntry { + // Pointer to the instruction. + MachineInstr *VSEMI; + + // Unique ID (position in the swap vector). + int VSEId; + + // Attributes of this node.
+ unsigned int IsLoad : 1; + unsigned int IsStore : 1; + unsigned int IsSwap : 1; + unsigned int MentionsPhysVR : 1; + unsigned int HasImplicitSubreg : 1; + unsigned int IsSwappable : 1; + unsigned int SpecialHandling : 3; + unsigned int WebRejected : 1; + unsigned int WillRemove : 1; +}; + +enum SHValues { + SH_NONE = 0, + SH_BUILDVEC, + SH_EXTRACT, + SH_INSERT, + SH_NOSWAP_LD, + SH_NOSWAP_ST, + SH_SPLAT +}; + +struct PPCVSXSwapRemoval : public MachineFunctionPass { + + static char ID; + const PPCInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + // Swap entries are allocated in a vector for better performance. + std::vector<PPCVSXSwapEntry> SwapVector; + + // A mapping is maintained between machine instructions and + // their swap entries. The key is the address of the MI. + DenseMap<MachineInstr*, int> SwapMap; + + // Equivalence classes are used to gather webs of related computation. + // Swap entries are represented by their VSEId fields. + EquivalenceClasses<int> *EC; + + PPCVSXSwapRemoval() : MachineFunctionPass(ID) { + initializePPCVSXSwapRemovalPass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize data structures. + void initialize(MachineFunction &MFParm); + + // Walk the machine instructions to gather vector usage information. + // Return true iff vector mentions are present. + bool gatherVectorInstructions(); + + // Add an entry to the swap vector and swap map. + int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry); + + // Hunt backwards through COPY and SUBREG_TO_REG chains for a + // source register. VecIdx indicates the swap vector entry to + // mark as mentioning a physical register if the search leads + // to one. + unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx); + + // Generate equivalence classes for related computations (webs). + void formWebs(); + + // Analyze webs and determine those that cannot be optimized. + void recordUnoptimizableWebs(); + + // Record which swap instructions can be safely removed. + void markSwapsForRemoval(); + + // Remove swaps and update other instructions requiring special + // handling. Return true iff any changes are made. + bool removeSwaps(); + + // Update instructions requiring special handling. + void handleSpecialSwappables(int EntryIdx); + + // Dump a description of the entries in the swap vector. + void dumpSwapVector(); + + // Return true iff the given register is in the given class. + bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return RC->hasSubClassEq(MRI->getRegClass(Reg)); + if (RC->contains(Reg)) + return true; + return false; + } + + // Return true iff the given register is a full vector register. + bool isVecReg(unsigned Reg) { + return (isRegInClass(Reg, &PPC::VSRCRegClass) || + isRegInClass(Reg, &PPC::VRRCRegClass)); + } + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + // If we don't have VSX on the subtarget, don't do anything. + const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>(); + if (!STI.hasVSX()) + return false; + + bool Changed = false; + initialize(MF); + + if (gatherVectorInstructions()) { + formWebs(); + recordUnoptimizableWebs(); + markSwapsForRemoval(); + Changed = removeSwaps(); + } + + // FIXME: See the allocation of EC in initialize(). + delete EC; + return Changed; + } +}; + +// Initialize data structures for this pass. In particular, clear the +// swap vector and allocate the equivalence class mapping before +// processing each function.
+void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = static_cast<const PPCInstrInfo*>(MF->getSubtarget().getInstrInfo()); + + // An initial vector size of 256 appears to work well in practice. + // Small/medium functions with vector content tend not to incur a + // reallocation at this size. Three of the vector tests in + // projects/test-suite reallocate, which seems like a reasonable rate. + const int InitialVectorSize(256); + SwapVector.clear(); + SwapVector.reserve(InitialVectorSize); + + // FIXME: Currently we allocate EC each time because we don't have + // access to the set representation on which to call clear(). Should + // consider adding a clear() method to the EquivalenceClasses class. + EC = new EquivalenceClasses<int>; +} + +// Create an entry in the swap vector for each instruction that mentions +// a full vector register, recording various characteristics of the +// instructions there. +bool PPCVSXSwapRemoval::gatherVectorInstructions() { + bool RelevantFunction = false; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + + bool RelevantInstr = false; + bool ImplicitSubreg = false; + + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (isVecReg(Reg)) { + RelevantInstr = true; + if (MO.getSubReg() != 0) + ImplicitSubreg = true; + break; + } + } + + if (!RelevantInstr) + continue; + + RelevantFunction = true; + + // Create a SwapEntry initialized to zeros, then fill in the + // instruction and ID fields before pushing it to the back + // of the swap vector. + PPCVSXSwapEntry SwapEntry{}; + int VecIdx = addSwapEntry(&MI, SwapEntry); + + if (ImplicitSubreg) + SwapVector[VecIdx].HasImplicitSubreg = 1; + + switch(MI.getOpcode()) { + default: + // Certain opcodes always kill the optimization. For these, + // we do nothing, causing the optimization to fail. These + // opcodes are identified using the LaneSensitive designation. + if (PPC::opcodeIsLaneSensitive(MI.getOpcode()) != -1) + break; + + // Unless noted otherwise, an instruction is considered + // safe for the optimization. There are a large number of + // such true-SIMD instructions (all vector math, logical, + // select, compare, etc.). + SwapVector[VecIdx].IsSwappable = 1; + break; + case PPC::XXPERMDI: + // This is a swap if it is of the form XXPERMDI t, s, s, 2. + // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we + // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2, + // for example. We have to look through chains of COPY and + // SUBREG_TO_REG to find the real source value for comparison. + // If the real source value is a physical register, then mark the + // XXPERMDI as mentioning a physical register. + // Any other form of XXPERMDI is lane-sensitive and unsafe + // for the optimization. + if (MI.getOperand(3).getImm() == 2) { + unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(), + VecIdx); + unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(), + VecIdx); + if (trueReg1 == trueReg2) + SwapVector[VecIdx].IsSwap = 1; + } + break; + case PPC::LVX: + // Non-permuting loads are currently unsafe. We can use special + // handling for this in the future. By not marking these as + // IsSwap, we ensure computations containing them will be rejected + // for now. + SwapVector[VecIdx].IsLoad = 1; + break; + case PPC::LXVD2X: + case PPC::LXVW4X: + // Permuting loads are marked as both load and swap, and are + // safe for optimization.
+ SwapVector[VecIdx].IsLoad = 1; + SwapVector[VecIdx].IsSwap = 1; + break; + case PPC::STVX: + // Non-permuting stores are currently unsafe. We can use special + // handling for this in the future. By not marking these as + // IsSwap, we ensure computations containing them will be rejected + // for now. + SwapVector[VecIdx].IsStore = 1; + break; + case PPC::STXVD2X: + case PPC::STXVW4X: + // Permuting stores are marked as both store and swap, and are + // safe for optimization. + SwapVector[VecIdx].IsStore = 1; + SwapVector[VecIdx].IsSwap = 1; + break; + case PPC::SUBREG_TO_REG: + // These are fine provided they are moving between full vector + // register classes. For example, the VRs are a subset of the + // VSRs, but each VR and each VSR is a full 128-bit register. + if (isVecReg(MI.getOperand(0).getReg()) && + isVecReg(MI.getOperand(2).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + break; + case PPC::COPY: + // These are fine provided they are moving between full vector + // register classes. + if (isVecReg(MI.getOperand(0).getReg()) && + isVecReg(MI.getOperand(1).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + break; + case PPC::VSPLTB: + case PPC::VSPLTH: + case PPC::VSPLTW: + // Splats are lane-sensitive, but we can use special handling + // to adjust the source lane for the splat. This is not yet + // implemented. When it is, we need to uncomment the following: + // SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT; + break; + // The presence of the following lane-sensitive operations in a + // web will kill the optimization, at least for now. + // FIXME: Some of these could be permitted with special handling, + // and will be phased in as time permits. + case PPC::INLINEASM: + case PPC::EXTRACT_SUBREG: + case PPC::INSERT_SUBREG: + case PPC::COPY_TO_REGCLASS: + break; + } + } + } + + if (RelevantFunction) { + DEBUG(dbgs() << "Swap vector when first built\n\n"); + dumpSwapVector(); + } + + return RelevantFunction; +} + +// Add an entry to the swap vector and swap map, and make a +// singleton equivalence class for the entry. +int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI, + PPCVSXSwapEntry& SwapEntry) { + SwapEntry.VSEMI = MI; + SwapEntry.VSEId = SwapVector.size(); + SwapVector.push_back(SwapEntry); + EC->insert(SwapEntry.VSEId); + SwapMap[MI] = SwapEntry.VSEId; + return SwapEntry.VSEId; +} + +// This is used to find the "true" source register for an +// XXPERMDI instruction, since MachineCSE does not handle the +// "copy-like" operations (Copy and SubregToReg). Returns +// the original SrcReg unless it is the target of a copy-like +// operation, in which case we chain backwards through all +// such operations to the ultimate source register. If a +// physical register is encountered, we stop the search and +// flag the swap entry indicated by VecIdx (the original +// XXPERMDI) as mentioning a physical register. Similarly +// for implicit subregister mentions (which should never +// happen). 
+unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg, + unsigned VecIdx) { + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg, CopySrcSubreg; + if (MI->isCopy()) { + CopySrcReg = MI->getOperand(1).getReg(); + CopySrcSubreg = MI->getOperand(1).getSubReg(); + } else { + assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + CopySrcSubreg = MI->getOperand(2).getSubReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) { + SwapVector[VecIdx].MentionsPhysVR = 1; + return CopySrcReg; + } + + if (CopySrcSubreg != 0) { + SwapVector[VecIdx].HasImplicitSubreg = 1; + return CopySrcReg; + } + + return lookThruCopyLike(CopySrcReg, VecIdx); +} + +// Generate equivalence classes for related computations (webs) by +// def-use relationships of virtual registers. Mention of a physical +// register terminates the generation of equivalence classes as this +// indicates a use of a parameter, definition of a return value, use +// of a value returned from a call, or definition of a parameter to a +// call. Computations with physical register mentions are flagged +// as such so their containing webs will not be optimized. +void PPCVSXSwapRemoval::formWebs() { + + DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n"); + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + + DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " "); + DEBUG(MI->dump()); + + // It's sufficient to walk vector uses and join them to their unique + // definitions. In addition, check *all* vector register operands + // for physical regs. + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + if (!isVecReg(Reg)) + continue; + + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + SwapVector[EntryIdx].MentionsPhysVR = 1; + continue; + } + + if (!MO.isUse()) + continue; + + MachineInstr* DefMI = MRI->getVRegDef(Reg); + assert(SwapMap.find(DefMI) != SwapMap.end() && + "Inconsistency: def of vector reg not found in swap map!"); + int DefIdx = SwapMap[DefMI]; + (void)EC->unionSets(SwapVector[DefIdx].VSEId, + SwapVector[EntryIdx].VSEId); + + DEBUG(dbgs() << format("Unioning %d with %d\n", SwapVector[DefIdx].VSEId, + SwapVector[EntryIdx].VSEId)); + DEBUG(dbgs() << " Def: "); + DEBUG(DefMI->dump()); + } + } +} + +// Walk the swap vector entries looking for conditions that prevent their +// containing computations from being optimized. When such conditions are +// found, mark the representative of the computation's equivalence class +// as rejected. +void PPCVSXSwapRemoval::recordUnoptimizableWebs() { + + DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n"); + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); + + // Reject webs containing mentions of physical registers or implicit + // subregs, or containing operations that we don't know how to handle + // in a lane-permuted region. 
+ if (SwapVector[EntryIdx].MentionsPhysVR || + SwapVector[EntryIdx].HasImplicitSubreg || + !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) { + + SwapVector[Repr].WebRejected = 1; + + DEBUG(dbgs() << + format("Web %d rejected for physreg, subreg, or not swap[pable]\n", + Repr)); + DEBUG(dbgs() << " in " << EntryIdx << ": "); + DEBUG(SwapVector[EntryIdx].VSEMI->dump()); + DEBUG(dbgs() << "\n"); + } + + // Reject webs that contain swapping loads that feed something other + // than a swap instruction. + else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned DefReg = MI->getOperand(0).getReg(); + + // We skip debug instructions in the analysis. (Note that debug + // location information is still maintained by this optimization + // because it remains on the LXVD2X and STXVD2X instructions after + // the XXPERMDIs are removed.) + for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) { + int UseIdx = SwapMap[&UseMI]; + + if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad || + SwapVector[UseIdx].IsStore) { + + SwapVector[Repr].WebRejected = 1; + + DEBUG(dbgs() << + format("Web %d rejected for load not feeding swap\n", Repr)); + DEBUG(dbgs() << " def " << EntryIdx << ": "); + DEBUG(MI->dump()); + DEBUG(dbgs() << " use " << UseIdx << ": "); + DEBUG(UseMI.dump()); + DEBUG(dbgs() << "\n"); + } + } + + // Reject webs that contain swapping stores that are fed by something + // other than a swap instruction. + } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned UseReg = MI->getOperand(0).getReg(); + MachineInstr *DefMI = MRI->getVRegDef(UseReg); + int DefIdx = SwapMap[DefMI]; + + if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad || + SwapVector[DefIdx].IsStore) { + + SwapVector[Repr].WebRejected = 1; + + DEBUG(dbgs() << + format("Web %d rejected for store not fed by swap\n", Repr)); + DEBUG(dbgs() << " def " << DefIdx << ": "); + DEBUG(DefMI->dump()); + DEBUG(dbgs() << " use " << EntryIdx << ": "); + DEBUG(MI->dump()); + DEBUG(dbgs() << "\n"); + } + } + } + + DEBUG(dbgs() << "Swap vector after web analysis:\n\n"); + dumpSwapVector(); +} + +// Walk the swap vector entries looking for swaps fed by permuting loads +// and swaps that feed permuting stores. If the containing computation +// has not been marked rejected, mark each such swap for removal. +// (Removal is delayed in case optimization has disturbed the pattern, +// such that multiple loads feed the same swap, etc.)
+void PPCVSXSwapRemoval::markSwapsForRemoval() { + + DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n"); + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + + if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) { + int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); + + if (!SwapVector[Repr].WebRejected) { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned DefReg = MI->getOperand(0).getReg(); + + for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) { + int UseIdx = SwapMap[&UseMI]; + SwapVector[UseIdx].WillRemove = 1; + + DEBUG(dbgs() << "Marking swap fed by load for removal: "); + DEBUG(UseMI.dump()); + } + } + + } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) { + int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); + + if (!SwapVector[Repr].WebRejected) { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned UseReg = MI->getOperand(0).getReg(); + MachineInstr *DefMI = MRI->getVRegDef(UseReg); + int DefIdx = SwapMap[DefMI]; + SwapVector[DefIdx].WillRemove = 1; + + DEBUG(dbgs() << "Marking swap feeding store for removal: "); + DEBUG(DefMI->dump()); + } + + } else if (SwapVector[EntryIdx].IsSwappable && + SwapVector[EntryIdx].SpecialHandling != 0) + handleSpecialSwappables(EntryIdx); + } +} + +// The identified swap entry requires special handling to allow its +// containing computation to be optimized. Perform that handling +// here. +// FIXME: This code is to be phased in with subsequent patches. +void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { +} + +// Walk the swap vector and replace each entry marked for removal with +// a copy operation. +bool PPCVSXSwapRemoval::removeSwaps() { + + DEBUG(dbgs() << "\n*** Removing swaps ***\n\n"); + + bool Changed = false; + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + if (SwapVector[EntryIdx].WillRemove) { + Changed = true; + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + MachineBasicBlock *MBB = MI->getParent(); + BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addOperand(MI->getOperand(1)); + + DEBUG(dbgs() << format("Replaced %d with copy: ", + SwapVector[EntryIdx].VSEId)); + DEBUG(MI->dump()); + + MI->eraseFromParent(); + } + } + + return Changed; +} + +// For debug purposes, dump the contents of the swap vector. 
+void PPCVSXSwapRemoval::dumpSwapVector() { + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + int ID = SwapVector[EntryIdx].VSEId; + + DEBUG(dbgs() << format("%6d", ID)); + DEBUG(dbgs() << format("%6d", EC->getLeaderValue(ID))); + DEBUG(dbgs() << format(" BB#%3d", MI->getParent()->getNumber())); + DEBUG(dbgs() << format(" %14s ", TII->getName(MI->getOpcode()))); + + if (SwapVector[EntryIdx].IsLoad) + DEBUG(dbgs() << "load "); + if (SwapVector[EntryIdx].IsStore) + DEBUG(dbgs() << "store "); + if (SwapVector[EntryIdx].IsSwap) + DEBUG(dbgs() << "swap "); + if (SwapVector[EntryIdx].MentionsPhysVR) + DEBUG(dbgs() << "physreg "); + if (SwapVector[EntryIdx].HasImplicitSubreg) + DEBUG(dbgs() << "implsubreg "); + + if (SwapVector[EntryIdx].IsSwappable) { + DEBUG(dbgs() << "swappable "); + switch(SwapVector[EntryIdx].SpecialHandling) { + default: + DEBUG(dbgs() << "special:**unknown**"); + break; + case SH_NONE: + break; + case SH_BUILDVEC: + DEBUG(dbgs() << "special:buildvec "); + break; + case SH_EXTRACT: + DEBUG(dbgs() << "special:extract "); + break; + case SH_INSERT: + DEBUG(dbgs() << "special:insert "); + break; + case SH_NOSWAP_LD: + DEBUG(dbgs() << "special:load "); + break; + case SH_NOSWAP_ST: + DEBUG(dbgs() << "special:store "); + break; + case SH_SPLAT: + DEBUG(dbgs() << "special:splat "); + break; + } + } + + if (SwapVector[EntryIdx].WebRejected) + DEBUG(dbgs() << "rejected "); + if (SwapVector[EntryIdx].WillRemove) + DEBUG(dbgs() << "remove "); + + DEBUG(dbgs() << "\n"); + } + + DEBUG(dbgs() << "\n"); +} + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE, + "PowerPC VSX Swap Removal", false, false) +INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE, + "PowerPC VSX Swap Removal", false, false) + +char PPCVSXSwapRemoval::ID = 0; +FunctionPass* +llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); } Index: test/CodeGen/PowerPC/swaps-le-1.ll =================================================================== --- test/CodeGen/PowerPC/swaps-le-1.ll +++ test/CodeGen/PowerPC/swaps-le-1.ll @@ -0,0 +1,147 @@ +; RUN: llc -O3 -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -O3 -mcpu=pwr8 -disable-ppc-vsx-swap-removal -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck -check-prefix=NOOPTSWAP %s + +; This test was generated from the following source: +; +; #define N 4096 +; int ca[N] __attribute__((aligned(16))); +; int cb[N] __attribute__((aligned(16))); +; int cc[N] __attribute__((aligned(16))); +; int cd[N] __attribute__((aligned(16))); +; +; void foo () +; { +; int i; +; for (i = 0; i < N; i++) { +; ca[i] = (cb[i] + cc[i]) * cd[i]; +; } +; } + +@cb = common global [4096 x i32] zeroinitializer, align 16 +@cc = common global [4096 x i32] zeroinitializer, align 16 +@cd = common global [4096 x i32] zeroinitializer, align 16 +@ca = common global [4096 x i32] zeroinitializer, align 16 + +define void @foo() { +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next.3, %vector.body ] + %0 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 16 + %2 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index + %3 = bitcast i32* %2 to <4 x i32>* + %wide.load13 = load <4 x i32>, <4 x i32>* %3, align 16 + %4 = add nsw <4 x i32> %wide.load13, %wide.load + %5 = getelementptr
inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index + %6 = bitcast i32* %5 to <4 x i32>* + %wide.load14 = load <4 x i32>, <4 x i32>* %6, align 16 + %7 = mul nsw <4 x i32> %4, %wide.load14 + %8 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index + %9 = bitcast i32* %8 to <4 x i32>* + store <4 x i32> %7, <4 x i32>* %9, align 16 + %index.next = add nuw nsw i64 %index, 4 + %10 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next + %11 = bitcast i32* %10 to <4 x i32>* + %wide.load.1 = load <4 x i32>, <4 x i32>* %11, align 16 + %12 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next + %13 = bitcast i32* %12 to <4 x i32>* + %wide.load13.1 = load <4 x i32>, <4 x i32>* %13, align 16 + %14 = add nsw <4 x i32> %wide.load13.1, %wide.load.1 + %15 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next + %16 = bitcast i32* %15 to <4 x i32>* + %wide.load14.1 = load <4 x i32>, <4 x i32>* %16, align 16 + %17 = mul nsw <4 x i32> %14, %wide.load14.1 + %18 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next + %19 = bitcast i32* %18 to <4 x i32>* + store <4 x i32> %17, <4 x i32>* %19, align 16 + %index.next.1 = add nuw nsw i64 %index.next, 4 + %20 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.1 + %21 = bitcast i32* %20 to <4 x i32>* + %wide.load.2 = load <4 x i32>, <4 x i32>* %21, align 16 + %22 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.1 + %23 = bitcast i32* %22 to <4 x i32>* + %wide.load13.2 = load <4 x i32>, <4 x i32>* %23, align 16 + %24 = add nsw <4 x i32> %wide.load13.2, %wide.load.2 + %25 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.1 + %26 = bitcast i32* %25 to <4 x i32>* + %wide.load14.2 = load <4 x i32>, <4 x i32>* %26, align 16 + %27 = mul nsw <4 x i32> %24, %wide.load14.2 + %28 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.1 + %29 = bitcast i32* %28 to <4 x i32>* + store <4 x i32> %27, <4 x i32>* %29, align 16 + %index.next.2 = add nuw nsw i64 %index.next.1, 4 + %30 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.2 + %31 = bitcast i32* %30 to <4 x i32>* + %wide.load.3 = load <4 x i32>, <4 x i32>* %31, align 16 + %32 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.2 + %33 = bitcast i32* %32 to <4 x i32>* + %wide.load13.3 = load <4 x i32>, <4 x i32>* %33, align 16 + %34 = add nsw <4 x i32> %wide.load13.3, %wide.load.3 + %35 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.2 + %36 = bitcast i32* %35 to <4 x i32>* + %wide.load14.3 = load <4 x i32>, <4 x i32>* %36, align 16 + %37 = mul nsw <4 x i32> %34, %wide.load14.3 + %38 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.2 + %39 = bitcast i32* %38 to <4 x i32>* + store <4 x i32> %37, <4 x i32>* %39, align 16 + %index.next.3 = add nuw nsw i64 %index.next.2, 4 + %40 = icmp eq i64 %index.next.3, 4096 + br i1 %40, label %for.end, label %vector.body + +for.end: + ret void +} + +; CHECK-LABEL: @foo +; CHECK-NOT: xxpermdi +; CHECK-NOT: xxswapd + +; CHECK: lxvd2x +; CHECK: lxvd2x +; CHECK-DAG: lxvd2x +; CHECK-DAG: vadduwm +; CHECK: vmuluwm +; CHECK: stxvd2x + +; CHECK: lxvd2x +; CHECK: lxvd2x +; CHECK-DAG: lxvd2x +; CHECK-DAG: vadduwm +; CHECK: vmuluwm +; CHECK: stxvd2x + +; CHECK: lxvd2x +; CHECK: lxvd2x +; CHECK-DAG: lxvd2x +; CHECK-DAG: vadduwm 
+; CHECK: vmuluwm +; CHECK: stxvd2x + +; CHECK: lxvd2x +; CHECK: lxvd2x +; CHECK-DAG: lxvd2x +; CHECK-DAG: vadduwm +; CHECK: vmuluwm +; CHECK: stxvd2x + + +; NOOPTSWAP-LABEL: @foo + +; NOOPTSWAP: lxvd2x +; NOOPTSWAP-DAG: lxvd2x +; NOOPTSWAP-DAG: lxvd2x +; NOOPTSWAP-DAG: xxpermdi +; NOOPTSWAP-DAG: xxpermdi +; NOOPTSWAP-DAG: xxpermdi +; NOOPTSWAP-DAG: vadduwm +; NOOPTSWAP: vmuluwm +; NOOPTSWAP: xxpermdi +; NOOPTSWAP-DAG: xxpermdi +; NOOPTSWAP-DAG: xxpermdi +; NOOPTSWAP-DAG: stxvd2x +; NOOPTSWAP-DAG: stxvd2x +; NOOPTSWAP: stxvd2x + Index: test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll =================================================================== --- test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll +++ test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll @@ -1,7 +1,6 @@ ; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t ; RUN: grep lxvd2x < %t | count 18 ; RUN: grep stxvd2x < %t | count 18 -; RUN: grep xxpermdi < %t | count 36 @vf = global <4 x float> , align 16 @vd = global <2 x double> , align 16 Index: test/CodeGen/PowerPC/vsx-ldst.ll =================================================================== --- test/CodeGen/PowerPC/vsx-ldst.ll +++ test/CodeGen/PowerPC/vsx-ldst.ll @@ -12,7 +12,6 @@ ; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t ; RUN: grep lxvd2x < %t | count 6 ; RUN: grep stxvd2x < %t | count 6 -; RUN: grep xxpermdi < %t | count 12 @vsi = global <4 x i32> , align 16 @vui = global <4 x i32> , align 16
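Note for reviewers unfamiliar with the web-forming machinery: formWebs() leans entirely on llvm::EquivalenceClasses<int>, unioning swap-entry IDs along def-use edges, and the later phases then consult each entry's leader. The standalone sketch below is not part of the patch (the entry IDs and the load/swap/add roles in its comments are invented for illustration), but it shows the same insert/unionSets/getLeaderValue pattern in isolation:

  // Minimal sketch of the EquivalenceClasses usage in formWebs(),
  // with made-up entry IDs standing in for swap-vector indices.
  #include "llvm/ADT/EquivalenceClasses.h"
  #include <cstdio>

  int main() {
    llvm::EquivalenceClasses<int> EC;

    // addSwapEntry() creates a singleton class per instruction.
    for (int Id = 0; Id < 5; ++Id)
      EC.insert(Id);

    // formWebs() unions each vector-register use with its unique def.
    EC.unionSets(0, 1); // e.g., an xxswapd (1) uses the lxvd2x result (0)
    EC.unionSets(1, 3); // e.g., a vadduwm (3) uses the xxswapd result (1)

    // Entries 0, 1, and 3 now share one leader (one web); entries 2
    // and 4 remain singleton webs.
    for (int Id = 0; Id < 5; ++Id)
      std::printf("entry %d -> web leader %d\n", Id, EC.getLeaderValue(Id));
    return 0;
  }

Keying everything off the leader is what lets recordUnoptimizableWebs() reject an entire web by setting a single WebRejected bit on the representative entry.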