diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -91,6 +91,11 @@
   // 401, 403, 405, 405fp, 440fp, 464, 464fp, 476, 476fp, 505, 740, 801,
   // 821, 823, 8540, e300c2, e300c3, e500mc64, e6500, 860, cell, titan, rs64.
   bool isValidCPUName(StringRef Name) const override;
+  bool isCPUP10OrNewer(StringRef Name) {
+    return llvm::StringSwitch<bool>(Name)
+        .Cases("power10", "pwr10", "future", true)
+        .Default(false);
+  }
   void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
 
   bool setCPU(const std::string &Name) override {
@@ -403,6 +408,7 @@
     LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
     IntMaxType = SignedLong;
     Int64Type = SignedLong;
+    bool IsP10 = isCPUP10OrNewer(Opts.CPU);
 
     if (Triple.isOSAIX()) {
       // TODO: Set appropriate ABI for AIX platform.
@@ -412,10 +418,12 @@
       LongDoubleAlign = DoubleAlign = 32;
       LongDoubleFormat = &llvm::APFloat::IEEEdouble();
     } else if ((Triple.getArch() == llvm::Triple::ppc64le)) {
-      resetDataLayout("e-m:e-i64:64-n32:64");
+      resetDataLayout(IsP10 ? "e-m:e-i64:64-n32:64-v256:128:128-v512:128:128"
+                            : "e-m:e-i64:64-n32:64");
       ABI = "elfv2";
     } else {
-      resetDataLayout("E-m:e-i64:64-n32:64");
+      resetDataLayout(IsP10 ? "E-m:e-i64:64-n32:64-v256:128:128-v512:128:128"
+                            : "E-m:e-i64:64-n32:64");
       ABI = "elfv1";
     }
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -134,10 +134,26 @@
 // RUN: FileCheck %s -check-prefix=PPC64-LINUX
 // PPC64-LINUX: target datalayout = "E-m:e-i64:64-n32:64"
 
+// RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm -target-cpu future %s | \
+// RUN: FileCheck %s -check-prefix=PPC64-FUTURE
+// PPC64-FUTURE: target datalayout = "E-m:e-i64:64-n32:64-v256:128:128-v512:128:128"
+
+// RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm -target-cpu pwr10 %s | \
+// RUN: FileCheck %s -check-prefix=PPC64-P10
+// PPC64-P10: target datalayout = "E-m:e-i64:64-n32:64-v256:128:128-v512:128:128"
+
 // RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=PPC64LE-LINUX
 // PPC64LE-LINUX: target datalayout = "e-m:e-i64:64-n32:64"
 
+// RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm -target-cpu future %s | \
+// RUN: FileCheck %s -check-prefix=PPC64LE-FUTURE
+// PPC64LE-FUTURE: target datalayout = "e-m:e-i64:64-n32:64-v256:128:128-v512:128:128"
+
+// RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm -target-cpu pwr10 %s | \
+// RUN: FileCheck %s -check-prefix=PPC64LE-P10
+// PPC64LE-P10: target datalayout = "e-m:e-i64:64-n32:64-v256:128:128-v512:128:128"
+
 // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=NVPTX
 // NVPTX: target datalayout = "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -436,6 +436,21 @@
     /// PLD.
     MAT_PCREL_ADDR,
 
+    /// ACC_BUILD = Build an accumulator register from 4 VSX registers.
+    ACC_BUILD,
+
+    /// PAIR_BUILD = Build a vector pair register from 2 VSX registers.
+    PAIR_BUILD,
+
+    /// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of
+    /// an accumulator or pair register. This node is needed because
+    /// EXTRACT_SUBVECTOR expects the input and output vectors to have the same
+    /// element type.
+    EXTRACT_VSX_REG,
+
+    /// XXMFACC = This corresponds to the xxmfacc instruction.
+    XXMFACC,
+
     /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
     /// byte-swapping store instruction. It byte-swaps the low "Type" bits of
     /// the GPRC input, then stores it through Ptr. Type can be either i16 or
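
A note on the datalayout change above (and its twin in PPCTargetMachine.cpp further down): the new "-v256:128:128-v512:128:128" tuples pin both the ABI and preferred alignment of the MMA types to 128 bits. A minimal standalone sketch of the effect, assuming an LLVM of roughly this patch's vintage (where DataLayout's StringRef constructor and getABITypeAlign are available):

// Sketch only: compare the ABI alignment DataLayout derives for the MMA
// types with and without the explicit vector-alignment tuples.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout Old("e-m:e-i64:64-n32:64");
  DataLayout New("e-m:e-i64:64-n32:64-v256:128:128-v512:128:128");
  auto *Pair = FixedVectorType::get(Type::getInt1Ty(Ctx), 256); // v256i1
  auto *Acc = FixedVectorType::get(Type::getInt1Ty(Ctx), 512);  // v512i1
  // Without the tuples, the alignment is computed from the type itself and
  // comes out far larger than the 16 bytes the paired loads/stores want.
  outs() << "v256i1 align: " << Old.getABITypeAlign(Pair).value() << " -> "
         << New.getABITypeAlign(Pair).value() << " bytes\n";
  outs() << "v512i1 align: " << Old.getABITypeAlign(Acc).value() << " -> "
         << New.getABITypeAlign(Acc).value() << " bytes\n";
  return 0;
}
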
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1096,6 +1096,15 @@
     }
   }
 
+  if (Subtarget.hasMMA()) {
+    addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
+    addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);
+    setOperationAction(ISD::LOAD, MVT::v512i1, Custom);
+    setOperationAction(ISD::LOAD, MVT::v256i1, Custom);
+    setOperationAction(ISD::STORE, MVT::v512i1, Custom);
+    setOperationAction(ISD::STORE, MVT::v256i1, Custom);
+  }
+
   if (Subtarget.has64BitSupport())
     setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
 
@@ -1430,6 +1439,10 @@
   case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
   case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
   case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
+  case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD";
+  case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD";
+  case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
+  case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
   case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
   case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
   }
@@ -7684,6 +7697,8 @@
 }
 
 SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType().isVector())
+    return LowerVectorLoad(Op, DAG);
 
   assert(Op.getValueType() == MVT::i1 &&
          "Custom lowering only for i1 loads");
@@ -7707,6 +7722,9 @@
 }
 
 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getOperand(1).getValueType().isVector())
+    return LowerVectorStore(Op, DAG);
+
   assert(Op.getOperand(1).getValueType() == MVT::i1 &&
          "Custom lowering only for i1 stores");
@@ -10230,6 +10248,90 @@
   return Op;
 }
 
+SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+  SDValue LoadChain = LN->getChain();
+  SDValue BasePtr = LN->getBasePtr();
+  EVT VT = Op.getValueType();
+
+  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
+  // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
+  // 2 or 4 vsx registers.
+  if (VT == MVT::v256i1 || VT == MVT::v512i1) {
+    assert(Subtarget.hasMMA() && "Type unsupported without MMA");
+    Align Alignment = LN->getAlign();
+    SmallVector<SDValue, 4> Loads;
+    SmallVector<SDValue, 4> LoadChains;
+    unsigned NumVecs = VT.getSizeInBits() / 128;
+    for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+      SDValue Load =
+          DAG.getLoad(MVT::v16i8, dl, LoadChain, BasePtr,
+                      LN->getPointerInfo().getWithOffset(Idx * 16),
+                      commonAlignment(Alignment, Idx * 16),
+                      LN->getMemOperand()->getFlags(), LN->getAAInfo());
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(16, dl, BasePtr.getValueType()));
+      Loads.push_back(Load);
+      LoadChains.push_back(Load.getValue(1));
+    }
+    if (Subtarget.isLittleEndian()) {
+      std::reverse(Loads.begin(), Loads.end());
+      std::reverse(LoadChains.begin(), LoadChains.end());
+    }
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+    SDValue Value =
+        DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
+                    dl, VT, Loads);
+    SDValue RetOps[] = {Value, TF};
+    return DAG.getMergeValues(RetOps, dl);
+  }
+
+  return Op;
+}
+
+SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+  SDValue StoreChain = SN->getChain();
+  SDValue BasePtr = SN->getBasePtr();
+  SDValue Value = SN->getValue();
+  EVT StoreVT = Value.getValueType();
+
+  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
+  // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
+  // underlying registers individually.
+  if (StoreVT == MVT::v256i1 || StoreVT == MVT::v512i1) {
+    assert(Subtarget.hasMMA() && "Type unsupported without MMA");
+    Align Alignment = SN->getAlign();
+    SmallVector<SDValue, 4> Stores;
+    unsigned NumVecs = 2;
+    if (StoreVT == MVT::v512i1) {
+      Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value);
+      NumVecs = 4;
+    }
+    for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+      unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
+      SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
+                                DAG.getConstant(VecNum, dl, MVT::i64));
+      SDValue Store =
+          DAG.getStore(StoreChain, dl, Elt, BasePtr,
+                       SN->getPointerInfo().getWithOffset(Idx * 16),
+                       commonAlignment(Alignment, Idx * 16),
+                       SN->getMemOperand()->getFlags(), SN->getAAInfo());
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(16, dl, BasePtr.getValueType()));
+      Stores.push_back(Store);
+    }
+    SDValue TF = DAG.getTokenFactor(dl, Stores);
+    return TF;
+  }
+
+  return Op;
+}
+
 SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   if (Op.getValueType() == MVT::v4i32) {
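
One subtlety in LowerVectorLoad/LowerVectorStore above is the endianness handling: on little-endian targets the underlying VSX registers of a pair or accumulator sit in memory in the opposite order, so the loads are reversed before being glued into ACC_BUILD/PAIR_BUILD, and the store side extracts register NumVecs - 1 - Idx for the store at byte offset 16 * Idx. A throwaway sketch of the resulting offset-to-register mapping (plain C++, not LLVM code):

#include <cstdio>

int main() {
  const unsigned NumVecs = 4; // 4 for v512i1 accumulators, 2 for v256i1 pairs
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx)
    std::printf("byte offset %2u: BE uses underlying reg %u, LE uses reg %u\n",
                Idx * 16, Idx, NumVecs - 1 - Idx);
  return 0;
}
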
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2342,6 +2342,31 @@
   auto DL = MI.getDebugLoc();
 
   switch (MI.getOpcode()) {
+  case PPC::BUILD_UACC: {
+    MCRegister ACC = MI.getOperand(0).getReg();
+    MCRegister UACC = MI.getOperand(1).getReg();
+    if (ACC - PPC::ACC0 != UACC - PPC::UACC0) {
+      MCRegister SrcVSR = PPC::VSL0 + (UACC - PPC::UACC0) * 4;
+      MCRegister DstVSR = PPC::VSL0 + (ACC - PPC::ACC0) * 4;
+      // FIXME: This can easily be improved to look up to the top of the MBB
+      // to see if the inputs are XXLOR's. If they are and SrcReg is killed,
+      // we can just re-target any such XXLOR's to DstVSR + offset.
+      for (int VecNo = 0; VecNo < 4; VecNo++)
+        BuildMI(MBB, MI, DL, get(PPC::XXLOR), DstVSR + VecNo)
+            .addReg(SrcVSR + VecNo)
+            .addReg(SrcVSR + VecNo);
+    }
+    // BUILD_UACC is expanded to 4 copies of the underlying vsx registers.
+    // So after building the 4 copies, we can replace the BUILD_UACC
+    // instruction with a NOP.
+    LLVM_FALLTHROUGH;
+  }
+  case PPC::KILL_PAIR: {
+    MI.setDesc(get(PPC::UNENCODED_NOP));
+    MI.RemoveOperand(1);
+    MI.RemoveOperand(0);
+    return true;
+  }
   case TargetOpcode::LOAD_STACK_GUARD: {
     assert(Subtarget.isTargetLinux() &&
            "Only Linux target is expected to contain LOAD_STACK_GUARD");
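
The register arithmetic in the BUILD_UACC expansion above relies on accumulator i (primed ACCi or unprimed UACCi) overlaying VSX registers 4*i through 4*i+3. A standalone sketch of the copies it emits, using a hypothetical post-RA register assignment:

#include <cstdio>

int main() {
  // Hypothetical allocation: copy UACC3 (vs12-vs15) into ACC1 (vs4-vs7).
  unsigned DstAcc = 1, SrcUacc = 3;
  unsigned DstVSR = DstAcc * 4, SrcVSR = SrcUacc * 4;
  // One xxlor per underlying VSX register, mirroring the BuildMI loop.
  for (unsigned VecNo = 0; VecNo < 4; ++VecNo)
    std::printf("xxlor vs%u, vs%u, vs%u\n", DstVSR + VecNo, SrcVSR + VecNo,
                SrcVSR + VecNo);
  return 0;
}
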
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -610,6 +610,10 @@
   let ParserMatchClass = PPCRegACCRCAsmOperand;
 }
 
+def uacc : RegisterOperand<UACCRC> {
+  let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
+
 def PPCRegVSRpRCAsmOperand : AsmOperandClass {
   let Name = "RegVSRpRC"; let PredicateMethod = "isVSRpEvenRegNumber";
 }
@@ -619,6 +623,37 @@
   let ParserMatchClass = PPCRegVSRpRCAsmOperand;
   let PrintMethod = "printVSRpEvenReg";
 }
 
+def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber";
+}
+
+def vsrpevenrc : RegisterOperand<VSRpRC> {
+  let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand;
+  let PrintMethod = "printVSRpEvenReg";
+  let EncoderMethod = "getVSRpEvenEncoding";
+  let DecoderMethod = "decodeVSRpEvenOperands";
+}
+
+def SDT_PPCAccBuild : SDTypeProfile<1, 4, [
+  SDTCisVT<0, v512i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>,
+  SDTCisVT<3, v4i32>, SDTCisVT<4, v4i32>
+]>;
+def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisInt<2>
+]>;
+def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisInt<2>
+]>;
+def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
+  SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
+]>;
+def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>;
+def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx,
+                              []>;
+def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx,
+                               []>;
+def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
+
 // [PO AS XO2 XO]
 class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
                 string asmstr, InstrItinClass itin, list<dag> pattern>
@@ -705,6 +740,11 @@
             XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi),
                       "xxmtacc $AT", IIC_VecGeneral, []>,
             RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp),
+                                     "#KILL_PAIR", []>,
+                  RegConstraint<"$XTp = $XSp">;
+  def BUILD_UACC : PPCPostRAExpPseudo<(outs acc:$AT), (ins uacc:$AS),
+                                      "#BUILD_UACC $AT, $AS", []>;
   // We define XXSETACCZ as rematerializable to undo CSE of that intrinsic in
   // the backend. We avoid CSE here because it generates a copy of the acc
   // register and this copy is more expensive than calling the intrinsic again.
@@ -715,6 +755,49 @@
   }
 }
 
+def Concats {
+  dag VecsToVecPair0 =
+    (v256i1 (INSERT_SUBREG
+              (INSERT_SUBREG (IMPLICIT_DEF), $vs0, sub_vsx1),
+              $vs1, sub_vsx0));
+  dag VecsToVecPair1 =
+    (v256i1 (INSERT_SUBREG
+              (INSERT_SUBREG (IMPLICIT_DEF), $vs2, sub_vsx1),
+              $vs3, sub_vsx0));
+  dag VecsToVecQuad =
+    (BUILD_UACC (INSERT_SUBREG
+                  (INSERT_SUBREG (v512i1 (IMPLICIT_DEF)),
+                                 (KILL_PAIR VecsToVecPair0), sub_pair0),
+                  (KILL_PAIR VecsToVecPair1), sub_pair1));
+}
+
+def Extracts {
+  dag Pair0 = (v256i1 (EXTRACT_SUBREG $v, sub_pair0));
+  dag Pair1 = (v256i1 (EXTRACT_SUBREG $v, sub_pair1));
+  dag Vec0 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx0));
+  dag Vec1 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx1));
+  dag Vec2 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx0));
+  dag Vec3 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx1));
+}
+
+let Predicates = [MMA] in {
+  def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)),
+            (XXMTACC Concats.VecsToVecQuad)>;
+  def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>;
+  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 0))),
+            Extracts.Vec0>;
+  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 1))),
+            Extracts.Vec1>;
+  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 2))),
+            Extracts.Vec2>;
+  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 3))),
+            Extracts.Vec3>;
+  def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 0))),
+            (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>;
+  def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 1))),
+            (v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>;
+}
+
 let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops] in {
   def LXVP : DQForm_XTp5_RA17_MEM<6, 0, (outs vsrprc:$XTp),
                                   (ins memrix16:$DQ_RA), "lxvp $XTp, $DQ_RA",
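
The patterns above are what the new lowering ultimately selects through. For context, a source-level sketch of code that exercises the whole path, assuming a clang with MMA enabled (e.g. -mcpu=pwr10), where __vector_quad lowers to v512i1 and __vector_pair to v256i1:

// Copying an accumulator through memory should produce the sequence the
// tests below check for: 16-byte vector loads feeding ACC_BUILD/xxmtacc on
// the way in, and XXMFACC/xxmfacc followed by 16-byte vector stores on the
// way out.
void copyAcc(__vector_quad *Dst, __vector_quad *Src) {
  __vector_quad Tmp = *Src; // v512i1 load
  *Dst = Tmp;               // v512i1 store
}
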
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -106,6 +106,15 @@
   let SubRegs = subregs;
 }
 
+// UACC - One of the 8 512-bit VSX accumulators prior to being primed.
+// Without using this register class, the register allocator has no way to
+// differentiate a primed accumulator from an unprimed accumulator.
+// This may result in invalid copies between primed and unprimed accumulators.
+class UACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  let HWEncoding{2-0} = num;
+  let SubRegs = subregs;
+}
+
 // VSR Pairs - One of the 32 paired even-odd consecutive VSRs.
 class VSRPair<bits<5> num, string n, list<Register> subregs> : PPCReg<n> {
   let HWEncoding{4-0} = num;
@@ -193,6 +202,22 @@
   let Size = 512;
 }
 
+let SubRegIndices = [sub_pair0, sub_pair1] in {
+  def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>;
+  def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>;
+  def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>;
+  def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>;
+  def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>;
+  def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>;
+  def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>;
+  def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>;
+}
+def UACCRC : RegisterClass<"PPC", [v512i1], 128,
+                           (add UACC0, UACC1, UACC2, UACC3,
+                                UACC4, UACC5, UACC6, UACC7)> {
+  let Size = 512;
+}
+
 // The representation of r0 when treated as the constant 0.
 def ZERO  : GPR<0, "0">,    DwarfRegAlias<R0>;
 def ZERO8 : GP8<ZERO, "0">, DwarfRegAlias<X0>;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -119,7 +119,7 @@
 }
 
 /// Return the datalayout string of a subtarget.
-static std::string getDataLayoutString(const Triple &T) {
+static std::string getDataLayoutString(const Triple &T, StringRef CPU) {
   bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
   std::string Ret;
 
@@ -149,6 +149,16 @@
   else
     Ret += "-n32";
 
+  // Specify the vector alignment explicitly. For v256i1 and v512i1, the
+  // calculated alignment would be 256*alignment(i1) and 512*alignment(i1),
+  // which is 256 and 512 bytes - way over aligned.
+  bool IsCPUP10OrNewer = llvm::StringSwitch<bool>(CPU)
+                             .Cases("power10", "pwr10", "future", true)
+                             .Default(false);
+  if (IsCPUP10OrNewer &&
+      (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppc64))
+    Ret += "-v256:128:128-v512:128:128";
+
   return Ret;
 }
 
@@ -300,7 +310,7 @@
                                    Optional<Reloc::Model> RM,
                                    Optional<CodeModel::Model> CM,
                                    CodeGenOpt::Level OL, bool JIT)
-    : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
+    : LLVMTargetMachine(T, getDataLayoutString(TT, CPU), TT, CPU,
                         computeFSAdditions(FS, OL, TT), Options,
                         getEffectiveRelocModel(TT, RM),
                         getEffectivePPCCodeModel(TT, CM, JIT), OL),
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=BE
+
+@f = common local_unnamed_addr global <512 x i1> zeroinitializer, align 16
+
+define void @testLdSt(i64 %SrcIdx, i64 %DstIdx) {
+; LE-LABEL: testLdSt:
+; LE:       # %bb.0: # %entry
+; LE-NEXT:    plxv vs1, f@PCREL+96(0), 1
+; LE-NEXT:    plxv vs0, f@PCREL+112(0), 1
+; LE-NEXT:    plxv vs3, f@PCREL+64(0), 1
+; LE-NEXT:    plxv vs2, f@PCREL+80(0), 1
+; LE-NEXT:    xxmtacc acc0
+; LE-NEXT:    xxmfacc acc0
+; LE-NEXT:    pstxv vs0, f@PCREL+176(0), 1
+; LE-NEXT:    pstxv vs1, f@PCREL+160(0), 1
+; LE-NEXT:    pstxv vs2, f@PCREL+144(0), 1
+; LE-NEXT:    pstxv vs3, f@PCREL+128(0), 1
+; LE-NEXT:    blr
+;
+; BE-LABEL: testLdSt:
+; BE:       # %bb.0: # %entry
+; BE-NEXT:    addis r3, r2, .LC0@toc@ha
+; BE-NEXT:    ld r3, .LC0@toc@l(r3)
+; BE-NEXT:    lxv vs1, 80(r3)
+; BE-NEXT:    lxv vs0, 64(r3)
+; BE-NEXT:    lxv vs3, 112(r3)
+; BE-NEXT:    lxv vs2, 96(r3)
+; BE-NEXT:    xxmtacc acc0
+; BE-NEXT:    xxmfacc acc0
+; BE-NEXT:    stxv vs1, 144(r3)
+; BE-NEXT:    stxv vs0, 128(r3)
+; BE-NEXT:    stxv vs3, 176(r3)
+; BE-NEXT:    stxv vs2, 160(r3)
+; BE-NEXT:    blr
+entry:
+  %arrayidx = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 1
+  %0 = load <512 x i1>, <512 x i1>* %arrayidx, align 16
+  %arrayidx1 = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 2
+  store <512 x i1> %0, <512 x i1>* %arrayidx1, align 16
+  ret void
+}
+
+define void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) {
+; LE-LABEL: testXLdSt:
+; LE:       # %bb.0: # %entry
+; LE-NEXT:    sldi r3, r3, 6
+; LE-NEXT:    paddi r5, 0, f@PCREL, 1
+; LE-NEXT:    add r6, r5, r3
+; LE-NEXT:    lxv vs1, 32(r6)
+; LE-NEXT:    lxv vs0, 48(r6)
+; LE-NEXT:    lxvx vs3, r5, r3
+; LE-NEXT:    lxv vs2, 16(r6)
+; LE-NEXT:    sldi r3, r4, 6
+; LE-NEXT:    xxmtacc acc0
+; LE-NEXT:    add r4, r5, r3
+; LE-NEXT:    xxmfacc acc0
+; LE-NEXT:    stxvx vs3, r5, r3
+; LE-NEXT:    stxv vs0, 48(r4)
+; LE-NEXT:    stxv vs1, 32(r4)
+; LE-NEXT:    stxv vs2, 16(r4)
+; LE-NEXT:    blr
+;
+; BE-LABEL: testXLdSt:
+; BE:       # %bb.0: # %entry
+; BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; BE-NEXT:    ld r5, .LC0@toc@l(r5)
+; BE-NEXT:    sldi r3, r3, 6
+; BE-NEXT:    add r6, r5, r3
+; BE-NEXT:    lxvx vs0, r5, r3
+; BE-NEXT:    lxv vs1, 16(r6)
+; BE-NEXT:    lxv vs3, 48(r6)
+; BE-NEXT:    lxv vs2, 32(r6)
+; BE-NEXT:    sldi r3, r4, 6
+; BE-NEXT:    xxmtacc acc0
+; BE-NEXT:    add r4, r5, r3
+; BE-NEXT:    xxmfacc acc0
+; BE-NEXT:    stxvx vs0, r5, r3
+; BE-NEXT:    stxv vs1, 16(r4)
+; BE-NEXT:    stxv vs3, 48(r4)
+; BE-NEXT:    stxv vs2, 32(r4)
+; BE-NEXT:    blr
+entry:
+  %arrayidx = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 %SrcIdx
+  %0 = load <512 x i1>, <512 x i1>* %arrayidx, align 16
+  %arrayidx1 = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 %DstIdx
+  store <512 x i1> %0, <512 x i1>* %arrayidx1, align 16
+  ret void
+}
+
+define void @testUnalignedLdSt() {
+; LE-LABEL: testUnalignedLdSt:
+; LE:       # %bb.0: # %entry
+; LE-NEXT:    plxv vs1, f@PCREL+43(0), 1
+; LE-NEXT:    plxv vs0, f@PCREL+59(0), 1
+; LE-NEXT:    plxv vs3, f@PCREL+11(0), 1
+; LE-NEXT:    plxv vs2, f@PCREL+27(0), 1
+; LE-NEXT:    xxmtacc acc0
+; LE-NEXT:    xxmfacc acc0
+; LE-NEXT:    pstxv vs0, f@PCREL+67(0), 1
+; LE-NEXT:    pstxv vs1, f@PCREL+51(0), 1
+; LE-NEXT:    pstxv vs2, f@PCREL+35(0), 1
+; LE-NEXT:    pstxv vs3, f@PCREL+19(0), 1
+; LE-NEXT:    blr
+;
+; BE-LABEL: testUnalignedLdSt:
+; BE:       # %bb.0: # %entry
+; BE-NEXT:    addis r3, r2, .LC0@toc@ha
+; BE-NEXT:    ld r3, .LC0@toc@l(r3)
+; BE-NEXT:    li r4, 11
+; BE-NEXT:    lxvx vs0, r3, r4
+; BE-NEXT:    li r4, 27
+; BE-NEXT:    lxvx vs1, r3, r4
+; BE-NEXT:    li r4, 43
+; BE-NEXT:    lxvx vs2, r3, r4
+; BE-NEXT:    li r4, 59
+; BE-NEXT:    lxvx vs3, r3, r4
+; BE-NEXT:    li r4, 35
+; BE-NEXT:    xxmtacc acc0
+; BE-NEXT:    xxmfacc acc0
+; BE-NEXT:    stxvx vs1, r3, r4
+; BE-NEXT:    li r4, 19
+; BE-NEXT:    stxvx vs0, r3, r4
+; BE-NEXT:    li r4, 67
+; BE-NEXT:    stxvx vs3, r3, r4
+; BE-NEXT:    li r4, 51
+; BE-NEXT:    stxvx vs2, r3, r4
+; BE-NEXT:    blr
+entry:
+  %0 = bitcast <512 x i1>* @f to i8*
+  %add.ptr = getelementptr inbounds i8, i8* %0, i64 11
+  %add.ptr1 = getelementptr inbounds i8, i8* %0, i64 19
+  %1 = bitcast i8* %add.ptr to <512 x i1>*
+  %2 = bitcast i8* %add.ptr1 to <512 x i1>*
+  %3 = load <512 x i1>, <512 x i1>* %1, align 1
+  store <512 x i1> %3, <512 x i1>* %2, align 1
+  ret void
+}
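
The assertions above were generated by utils/update_llc_test_checks.py (per the NOTE line). If the lowering changes, they can be regenerated with something like the following; the build path is illustrative:

  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
      llvm/test/CodeGen/PowerPC/mma-acc-memops.ll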