Index: clang/lib/Basic/Targets/PPC.h =================================================================== --- clang/lib/Basic/Targets/PPC.h +++ clang/lib/Basic/Targets/PPC.h @@ -403,19 +403,20 @@ LongWidth = LongAlign = PointerWidth = PointerAlign = 64; IntMaxType = SignedLong; Int64Type = SignedLong; + std::string DataLayout = ""; if (Triple.isOSAIX()) { // TODO: Set appropriate ABI for AIX platform. - resetDataLayout("E-m:a-i64:64-n32:64"); + DataLayout = "E-m:a-i64:64-n32:64"; SuitableAlign = 64; LongDoubleWidth = 64; LongDoubleAlign = DoubleAlign = 32; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } else if ((Triple.getArch() == llvm::Triple::ppc64le)) { - resetDataLayout("e-m:e-i64:64-n32:64"); + DataLayout = "e-m:e-i64:64-n32:64"; ABI = "elfv2"; } else { - resetDataLayout("E-m:e-i64:64-n32:64"); + DataLayout = "E-m:e-i64:64-n32:64"; ABI = "elfv1"; } @@ -424,6 +425,11 @@ LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } + if (Triple.isOSAIX() || Triple.isOSLinux()) + DataLayout += "-v256:256:256-v512:512:512"; + + resetDataLayout(DataLayout); + // PPC64 supports atomics up to 8 bytes. MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64; } Index: clang/test/CodeGen/target-data.c =================================================================== --- clang/test/CodeGen/target-data.c +++ clang/test/CodeGen/target-data.c @@ -136,11 +136,27 @@ // RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=PPC64-LINUX -// PPC64-LINUX: target datalayout = "E-m:e-i64:64-n32:64" +// PPC64-LINUX: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512" + +// RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm -target-cpu future %s | \ +// RUN: FileCheck %s -check-prefix=PPC64-FUTURE +// PPC64-FUTURE: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512" + +// RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm -target-cpu pwr10 %s | \ +// RUN: FileCheck %s -check-prefix=PPC64-P10 +// PPC64-P10: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512" // RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=PPC64LE-LINUX -// PPC64LE-LINUX: target datalayout = "e-m:e-i64:64-n32:64" +// PPC64LE-LINUX: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" + +// RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm -target-cpu future %s | \ +// RUN: FileCheck %s -check-prefix=PPC64LE-FUTURE +// PPC64LE-FUTURE: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" + +// RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm -target-cpu pwr10 %s | \ +// RUN: FileCheck %s -check-prefix=PPC64LE-P10 +// PPC64LE-P10: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=NVPTX Index: llvm/lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.h +++ llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -441,6 +441,21 @@ /// through an add like PADDI. TLS_DYNAMIC_MAT_PCREL_ADDR, + /// ACC_BUILD = Build an accumulator register from 4 VSX registers. + ACC_BUILD, + + /// PAIR_BUILD = Build a vector pair register from 2 VSX registers. + PAIR_BUILD, + + /// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of + /// an accumulator or pair register. 
+  /// This node is needed because
+  /// EXTRACT_SUBVECTOR expects the input and output vectors to have the same
+  /// element type.
+  EXTRACT_VSX_REG,
+
+  /// XXMFACC = This corresponds to the xxmfacc instruction.
+  XXMFACC,
+
   // Constrained conversion from floating point to int
   STRICT_FCTIDZ = ISD::FIRST_TARGET_STRICTFP_OPCODE,
   STRICT_FCTIWZ,
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1172,6 +1172,18 @@
     }
   }
 
+  if (Subtarget.pairedVectorMemops()) {
+    addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);
+    setOperationAction(ISD::LOAD, MVT::v256i1, Custom);
+    setOperationAction(ISD::STORE, MVT::v256i1, Custom);
+  }
+  if (Subtarget.hasMMA()) {
+    addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
+    setOperationAction(ISD::LOAD, MVT::v512i1, Custom);
+    setOperationAction(ISD::STORE, MVT::v512i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
+  }
+
   if (Subtarget.has64BitSupport())
     setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
 
@@ -1505,6 +1517,10 @@
   case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
   case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR:
     return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR";
+  case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD";
+  case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD";
+  case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
+  case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
   case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
   case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
   case PPCISD::STRICT_FADDRTZ:
@@ -7789,6 +7805,8 @@
 }
 
 SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType().isVector())
+    return LowerVectorLoad(Op, DAG);
   assert(Op.getValueType() == MVT::i1 &&
          "Custom lowering only for i1 loads");
 
@@ -7812,6 +7830,9 @@
 }
 
 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getOperand(1).getValueType().isVector())
+    return LowerVectorStore(Op, DAG);
+
   assert(Op.getOperand(1).getValueType() == MVT::i1 &&
          "Custom lowering only for i1 stores");
 
@@ -10484,6 +10505,96 @@
   return Op;
 }
 
+SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+  SDValue LoadChain = LN->getChain();
+  SDValue BasePtr = LN->getBasePtr();
+  EVT VT = Op.getValueType();
+
+  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
+  // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
+  // 2 or 4 vsx registers.
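+  // For example (illustrative note, not part of the original patch): a
+  // v512i1 load becomes four v16i8 loads at offsets 0, 16, 32 and 48 that
+  // are combined into one accumulator with ACC_BUILD, while a v256i1 load
+  // becomes two such loads combined with PAIR_BUILD.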
+  if (VT == MVT::v256i1 || VT == MVT::v512i1) {
+    assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
+           "Type unsupported without MMA");
+    assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
+           "Type unsupported without paired vector support");
+    Align Alignment = LN->getAlign();
+    SmallVector<SDValue, 4> Loads;
+    SmallVector<SDValue, 4> LoadChains;
+    unsigned NumVecs = VT.getSizeInBits() / 128;
+    for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+      SDValue Load =
+          DAG.getLoad(MVT::v16i8, dl, LoadChain, BasePtr,
+                      LN->getPointerInfo().getWithOffset(Idx * 16),
+                      commonAlignment(Alignment, Idx * 16),
+                      LN->getMemOperand()->getFlags(), LN->getAAInfo());
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(16, dl, BasePtr.getValueType()));
+      Loads.push_back(Load);
+      LoadChains.push_back(Load.getValue(1));
+    }
+    if (Subtarget.isLittleEndian()) {
+      std::reverse(Loads.begin(), Loads.end());
+      std::reverse(LoadChains.begin(), LoadChains.end());
+    }
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+    SDValue Value =
+        DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
+                    dl, VT, Loads);
+    SDValue RetOps[] = {Value, TF};
+    return DAG.getMergeValues(RetOps, dl);
+  }
+
+  return Op;
+}
+
+SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+  SDValue StoreChain = SN->getChain();
+  SDValue BasePtr = SN->getBasePtr();
+  SDValue Value = SN->getValue();
+  EVT StoreVT = Value.getValueType();
+
+  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
+  // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
+  // underlying registers individually.
+  if (StoreVT == MVT::v256i1 || StoreVT == MVT::v512i1) {
+    assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
+           "Type unsupported without MMA");
+    assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
+           "Type unsupported without paired vector support");
+    Align Alignment = SN->getAlign();
+    SmallVector<SDValue, 4> Stores;
+    unsigned NumVecs = 2;
+    if (StoreVT == MVT::v512i1) {
+      Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value);
+      NumVecs = 4;
+    }
+    for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+      unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
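+      // Illustrative note (not part of the original patch): on little-endian
+      // subtargets the lowest-addressed 16 bytes are stored from the
+      // highest-numbered underlying VSX register, mirroring the reversal of
+      // Loads in LowerVectorLoad above.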
+      SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
+                                DAG.getConstant(VecNum, dl, MVT::i64));
+      SDValue Store =
+          DAG.getStore(StoreChain, dl, Elt, BasePtr,
+                       SN->getPointerInfo().getWithOffset(Idx * 16),
+                       commonAlignment(Alignment, Idx * 16),
+                       SN->getMemOperand()->getFlags(), SN->getAAInfo());
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(16, dl, BasePtr.getValueType()));
+      Stores.push_back(Store);
+    }
+    SDValue TF = DAG.getTokenFactor(dl, Stores);
+    return TF;
+  }
+
+  return Op;
+}
+
 SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   if (Op.getValueType() == MVT::v4i32) {
Index: llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2357,6 +2357,31 @@
   auto DL = MI.getDebugLoc();
 
   switch (MI.getOpcode()) {
+  case PPC::BUILD_UACC: {
+    MCRegister ACC = MI.getOperand(0).getReg();
+    MCRegister UACC = MI.getOperand(1).getReg();
+    if (ACC - PPC::ACC0 != UACC - PPC::UACC0) {
+      MCRegister SrcVSR = PPC::VSL0 + (UACC - PPC::UACC0) * 4;
+      MCRegister DstVSR = PPC::VSL0 + (ACC - PPC::ACC0) * 4;
+      // FIXME: This can easily be improved to look up to the top of the MBB
+      // to see if the inputs are XXLOR's. If they are and SrcReg is killed,
+      // we can just re-target any such XXLOR's to DstVSR + offset.
+      for (int VecNo = 0; VecNo < 4; VecNo++)
+        BuildMI(MBB, MI, DL, get(PPC::XXLOR), DstVSR + VecNo)
+            .addReg(SrcVSR + VecNo)
+            .addReg(SrcVSR + VecNo);
+    }
+    // BUILD_UACC is expanded to 4 copies of the underlying vsx registers.
+    // So after building the 4 copies, we can replace the BUILD_UACC
+    // instruction with a NOP.
+    LLVM_FALLTHROUGH;
+  }
+  case PPC::KILL_PAIR: {
+    MI.setDesc(get(PPC::UNENCODED_NOP));
+    MI.RemoveOperand(1);
+    MI.RemoveOperand(0);
+    return true;
+  }
   case TargetOpcode::LOAD_STACK_GUARD: {
     assert(Subtarget.isTargetLinux() &&
            "Only Linux target is expected to contain LOAD_STACK_GUARD");
Index: llvm/lib/Target/PowerPC/PPCInstrPrefix.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -6,11 +6,31 @@
   SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
 ]>;
 
+def SDT_PPCAccBuild : SDTypeProfile<1, 4, [
+  SDTCisVT<0, v512i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>,
+  SDTCisVT<3, v4i32>, SDTCisVT<4, v4i32>
+]>;
+def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisInt<2>
+]>;
+def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisInt<2>
+]>;
+def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
+  SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
+]>;
+
 //===----------------------------------------------------------------------===//
 // ISA 3.1 specific PPCISD nodes.
 //
 
 def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
+def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>;
+def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx,
+                              []>;
+def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG",
+                               SDT_PPCPairExtractVsx, []>;
+def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
 
 //===----------------------------------------------------------------------===//
 
@@ -519,6 +539,17 @@
   let PrintMethod = "printVSRpEvenReg";
 }
 
+def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber";
+}
+
+def vsrpevenrc : RegisterOperand<VSRpRC> {
+  let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand;
+  let PrintMethod = "printVSRpEvenReg";
+  let EncoderMethod = "getVSRpEvenEncoding";
+  let DecoderMethod = "decodeVSRpEvenOperands";
+}
+
 class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
                            string asmstr, InstrItinClass itin,
                            list<dag> pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
@@ -588,6 +619,10 @@
   let ParserMatchClass = PPCRegACCRCAsmOperand;
 }
 
+def uacc : RegisterOperand<UACCRC> {
+  let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
+
 // [PO AS XO2 XO]
 class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
                 string asmstr, InstrItinClass itin, list<dag> pattern>
@@ -768,6 +803,11 @@
             XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi),
                       "xxmtacc $AT", IIC_VecGeneral, []>,
                       RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp),
+                                     "#KILL_PAIR", []>,
+                  RegConstraint<"$XTp = $XSp">;
+  def BUILD_UACC : PPCPostRAExpPseudo<(outs acc:$AT), (ins uacc:$AS),
+                                      "#BUILD_UACC $AT, $AS", []>;
   // We define XXSETACCZ as rematerializable to undo CSE of that intrinsic in
   // the backend. We avoid CSE here because it generates a copy of the acc
   // register and this copy is more expensive than calling the intrinsic again.
@@ -778,6 +818,50 @@
   }
 }
 
+def Concats {
+  dag VecsToVecPair0 =
+    (v256i1 (INSERT_SUBREG
+      (INSERT_SUBREG (IMPLICIT_DEF), $vs0, sub_vsx1),
+      $vs1, sub_vsx0));
+  dag VecsToVecPair1 =
+    (v256i1 (INSERT_SUBREG
+      (INSERT_SUBREG (IMPLICIT_DEF), $vs2, sub_vsx1),
+      $vs3, sub_vsx0));
+  dag VecsToVecQuad =
+    (BUILD_UACC (INSERT_SUBREG
+      (INSERT_SUBREG (v512i1 (IMPLICIT_DEF)),
+                     (KILL_PAIR VecsToVecPair0), sub_pair0),
+      (KILL_PAIR VecsToVecPair1), sub_pair1));
+}
+
+def Extracts {
+  dag Pair0 = (v256i1 (EXTRACT_SUBREG $v, sub_pair0));
+  dag Pair1 = (v256i1 (EXTRACT_SUBREG $v, sub_pair1));
+  dag Vec0 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx0));
+  dag Vec1 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx1));
+  dag Vec2 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx0));
+  dag Vec3 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx1));
+}
+
+let Predicates = [MMA] in {
+  def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3,
+                                 v4i32:$vs2)),
+            (XXMTACC Concats.VecsToVecQuad)>;
+  def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>;
+  def : Pat<(v512i1 immAllZerosV), (XXSETACCZ)>;
+  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 0))),
+            Extracts.Vec0>;
+  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 1))),
+            Extracts.Vec1>;
+  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 2))),
+            Extracts.Vec2>;
+  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 3))),
+            Extracts.Vec3>;
+  def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 0))),
+            (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>;
+  def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 1))),
+            (v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>;
+}
+
 let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops] in {
   def LXVP : DQForm_XTp5_RA17_MEM<6, 0, (outs vsrprc:$XTp),
                                   (ins memrix16:$DQ_RA), "lxvp $XTp, $DQ_RA",
Index: llvm/lib/Target/PowerPC/PPCRegisterInfo.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -104,6 +104,15 @@
   let SubRegs = subregs;
 }
 
+// UACC - One of the 8 512-bit VSX accumulators prior to being primed.
+// Without using this register class, the register allocator has no way to
+// differentiate a primed accumulator from an unprimed accumulator.
+// This may result in invalid copies between primed and unprimed accumulators.
+class UACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  let HWEncoding{2-0} = num;
+  let SubRegs = subregs;
+}
+
 // VSR Pairs - One of the 32 paired even-odd consecutive VSRs.
 class VSRPair<bits<5> num, string n, list<Register> subregs> : PPCReg<n> {
   let HWEncoding{4-0} = num;
   let SubRegs = subregs;
 }
 
@@ -420,6 +429,22 @@
   let Size = 512;
 }
 
+let SubRegIndices = [sub_pair0, sub_pair1] in {
+  def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>;
+  def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>;
+  def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>;
+  def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>;
+  def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>;
+  def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>;
+  def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>;
+  def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>;
+}
+def UACCRC : RegisterClass<"PPC", [v512i1], 128,
+                           (add UACC0, UACC1, UACC2, UACC3,
+                                UACC4, UACC5, UACC6, UACC7)> {
+  let Size = 512;
+}
+
 // Allocate in the same order as the underlying VSX registers.
 def VSRpRC : RegisterClass<"PPC",
                            [v4i64,v4f64,v8i32,v8f32,v16i16,v32i8,v256i1], 128,
Index: llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -149,6 +149,13 @@
   else
     Ret += "-n32";
 
+  // Specify the vector alignment explicitly. For v256i1 and v512i1, the
+  // calculated alignment would be 256*alignment(i1) and 512*alignment(i1),
+  // i.e. 256 and 512 bytes, which is far more aligned than necessary.
+  if ((T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppc64) &&
+      (T.isOSAIX() || T.isOSLinux()))
+    Ret += "-v256:256:256-v512:512:512";
+
   return Ret;
 }
 
Index: llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=BE
+
+@f = common local_unnamed_addr global <512 x i1> zeroinitializer, align 16
+
+define void @testLdSt(i64 %SrcIdx, i64 %DstIdx) {
+; LE-LABEL: testLdSt:
+; LE:       # %bb.0: # %entry
+; LE-NEXT:    plxv vs1, f@PCREL+96(0), 1
+; LE-NEXT:    plxv vs0, f@PCREL+112(0), 1
+; LE-NEXT:    plxv vs3, f@PCREL+64(0), 1
+; LE-NEXT:    plxv vs2, f@PCREL+80(0), 1
+; LE-NEXT:    xxmtacc acc0
+; LE-NEXT:    xxmfacc acc0
+; LE-NEXT:    pstxv vs0, f@PCREL+176(0), 1
+; LE-NEXT:    pstxv vs1, f@PCREL+160(0), 1
+; LE-NEXT:    pstxv vs2, f@PCREL+144(0), 1
+; LE-NEXT:    pstxv vs3, f@PCREL+128(0), 1
+; LE-NEXT:    blr
+;
+; BE-LABEL: testLdSt:
+; BE:       # %bb.0: # %entry
+; BE-NEXT:    addis r3, r2, .LC0@toc@ha
+; BE-NEXT:    ld r3, .LC0@toc@l(r3)
+; BE-NEXT:    lxv vs1, 80(r3)
+; BE-NEXT:    lxv vs0, 64(r3)
+; BE-NEXT:    lxv vs3, 112(r3)
+; BE-NEXT:    lxv vs2, 96(r3)
+; BE-NEXT:    xxmtacc acc0
+; BE-NEXT:    xxmfacc acc0
+; BE-NEXT:    stxv vs1, 144(r3)
+; BE-NEXT:    stxv vs0, 128(r3)
+; BE-NEXT:    stxv vs3, 176(r3)
+; BE-NEXT:    stxv vs2, 160(r3)
+; BE-NEXT:    blr
+entry:
+  %arrayidx = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 1
+  %0 = load <512 x i1>, <512 x i1>* %arrayidx, align 16
+  %arrayidx1 = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 2
+  store <512 x i1> %0, <512 x i1>* %arrayidx1, align 16
+  ret void
+}
+
+define void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) {
+; LE-LABEL: testXLdSt:
+; LE:       # %bb.0: # %entry
+; LE-NEXT:    sldi r3, r3, 6
+; LE-NEXT:    paddi r5, 0, f@PCREL, 1
+; LE-NEXT:    add r6, r5, r3
+; LE-NEXT:    lxv vs1, 32(r6)
+; LE-NEXT:    lxv vs0, 48(r6)
+; LE-NEXT:    lxvx vs3, r5, r3
+; LE-NEXT:    lxv vs2, 16(r6)
+; LE-NEXT:    sldi r3, r4, 6
+; LE-NEXT:    xxmtacc acc0
+; LE-NEXT:    add r4, r5, r3
+; LE-NEXT:    xxmfacc acc0
+; LE-NEXT:    stxvx vs3, r5, r3
+; LE-NEXT:    stxv vs0, 48(r4)
+; LE-NEXT:    stxv vs1, 32(r4)
+; LE-NEXT:    stxv vs2, 16(r4)
+; LE-NEXT:    blr
+;
+; BE-LABEL: testXLdSt:
+; BE:       # %bb.0: # %entry
+; BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; BE-NEXT:    ld r5, .LC0@toc@l(r5)
+; BE-NEXT:    sldi r3, r3, 6
+; BE-NEXT:    add r6, r5, r3
+; BE-NEXT:    lxvx vs0, r5, r3
+; BE-NEXT:    lxv vs1, 16(r6)
+; BE-NEXT:    lxv vs3, 48(r6)
+; BE-NEXT:    lxv vs2, 32(r6)
+; BE-NEXT:    sldi r3, r4, 6
+; BE-NEXT:    xxmtacc acc0
+; BE-NEXT:    add r4, r5, r3
+; BE-NEXT:    xxmfacc acc0
+; BE-NEXT:    stxvx vs0, r5, r3
+; BE-NEXT:    stxv vs1, 16(r4)
+; BE-NEXT:    stxv vs3, 48(r4)
+; BE-NEXT:    stxv vs2, 32(r4)
+; BE-NEXT:    blr
+entry:
+  %arrayidx = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 %SrcIdx
+  %0 = load <512 x i1>, <512 x i1>* %arrayidx, align 16
+  %arrayidx1 = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 %DstIdx
+  store <512 x i1> %0, <512 x i1>* %arrayidx1, align 16
+  ret void
+}
+
+define void @testUnalignedLdSt() {
+; LE-LABEL: testUnalignedLdSt:
+; LE:       # %bb.0: # %entry
+; LE-NEXT:    plxv vs1, f@PCREL+43(0), 1
+; LE-NEXT:    plxv vs0, f@PCREL+59(0), 1
+; LE-NEXT:    plxv vs3, f@PCREL+11(0), 1
+; LE-NEXT:    plxv vs2, f@PCREL+27(0), 1
+; LE-NEXT:    xxmtacc acc0
+; LE-NEXT:    xxmfacc acc0
+; LE-NEXT:    pstxv vs0, f@PCREL+67(0), 1
+; LE-NEXT:    pstxv vs1, f@PCREL+51(0), 1
+; LE-NEXT:    pstxv vs2, f@PCREL+35(0), 1
+; LE-NEXT:    pstxv vs3, f@PCREL+19(0), 1
+; LE-NEXT:    blr
+;
+; BE-LABEL: testUnalignedLdSt:
+; BE:       # %bb.0: # %entry
+; BE-NEXT:    addis r3, r2, .LC0@toc@ha
+; BE-NEXT:    ld r3, .LC0@toc@l(r3)
+; BE-NEXT:    li r4, 11
+; BE-NEXT:    lxvx vs0, r3, r4
+; BE-NEXT:    li r4, 27
+; BE-NEXT:    lxvx vs1, r3, r4
+; BE-NEXT:    li r4, 43
+; BE-NEXT:    lxvx vs2, r3, r4
+; BE-NEXT:    li r4, 59
+; BE-NEXT:    lxvx vs3, r3, r4
+; BE-NEXT:    li r4, 35
+; BE-NEXT:    xxmtacc acc0
+; BE-NEXT:    xxmfacc acc0
+; BE-NEXT:    stxvx vs1, r3, r4
+; BE-NEXT:    li r4, 19
+; BE-NEXT:    stxvx vs0, r3, r4
+; BE-NEXT:    li r4, 67
+; BE-NEXT:    stxvx vs3, r3, r4
+; BE-NEXT:    li r4, 51
+; BE-NEXT:    stxvx vs2, r3, r4
+; BE-NEXT:    blr
+entry:
+  %0 = bitcast <512 x i1>* @f to i8*
+  %add.ptr = getelementptr inbounds i8, i8* %0, i64 11
+  %add.ptr1 = getelementptr inbounds i8, i8* %0, i64 19
+  %1 = bitcast i8* %add.ptr to <512 x i1>*
+  %2 = bitcast i8* %add.ptr1 to <512 x i1>*
+  %3 = load <512 x i1>, <512 x i1>* %1, align 1
+  store <512 x i1> %3, <512 x i1>* %2, align 1
+  ret void
+}
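
Note (editorial addition, not part of the patch): the test above only covers the
v512i1 accumulator path. A minimal sketch of IR that would exercise the v256i1
pair path through LowerVectorLoad/LowerVectorStore, using a hypothetical
function name, is shown below; under -mcpu=pwr10 with paired vector memops it
should lower to two 16-byte vector loads and two 16-byte vector stores, in the
endian-dependent register order chosen by those routines.

; Sketch only; not autogenerated and not part of mma-acc-memops.ll.
define void @testPairLdSt(<256 x i1>* %Src, <256 x i1>* %Dst) {
entry:
  %0 = load <256 x i1>, <256 x i1>* %Src, align 16
  store <256 x i1> %0, <256 x i1>* %Dst, align 16
  ret void
}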