diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1041,6 +1041,14 @@
     return false;
   }
 
+  // Use this to bypass the builtin legalization decisions for EVTs. The
+  // builtin scheme may lead to undesirable results (e.g. power-of-two padding
+  // or scalarization) for EVT-typed nodes (e.g. v7f16).
+  virtual Optional<LegalizeKind> getCustomTypeConversion(LLVMContext &Context,
+                                                         EVT VT) const {
+    return None;
+  }
+
   /// Return how this operation should be treated: either it is legal, needs to
   /// be promoted to a larger size, needs to be expanded to some other code
   /// sequence, or the target has a custom expander for it.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -422,6 +422,27 @@
       return Val;
     }
 
+    // Vector/Vector bitcast.
+    if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
+      return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+    if (ValueVT.isScalableVector()) {
+      assert(PartEVT.getVectorElementCount() ==
+             ValueVT.getVectorElementCount());
+      // Promote or truncate.
+      return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
+    }
+
+    // Shorten and promote.
+    assert(PartEVT.getVectorNumElements() >= ValueVT.getVectorNumElements());
+    if (PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
+      EVT ClippedVT =
+          EVT::getVectorVT(*DAG.getContext(), PartEVT.getVectorElementType(),
+                           ValueVT.getVectorNumElements());
+      Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ClippedVT, Val,
+                        DAG.getVectorIdxConstant(0, DL));
+    }
+
     // Promoted vector extract
     return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
   }
@@ -617,21 +638,36 @@
   EVT ValueVT = Val.getValueType();
   ElementCount PartNumElts = PartVT.getVectorElementCount();
   ElementCount ValueNumElts = ValueVT.getVectorElementCount();
+  bool ElementMismatch = PartVT.getVectorElementType() != ValueVT.getVectorElementType();
 
   // We only support widening vectors with equivalent element types and
   // fixed/scalable properties. If a target needs to widen a fixed-length type
   // to a scalable one, it should be possible to use INSERT_SUBVECTOR below.
   if (ElementCount::isKnownLE(PartNumElts, ValueNumElts) ||
-      PartNumElts.isScalable() != ValueNumElts.isScalable() ||
-      PartVT.getVectorElementType() != ValueVT.getVectorElementType())
+      PartNumElts.isScalable() != ValueNumElts.isScalable())
     return SDValue();
 
   // Widening a scalable vector to another scalable vector is done by inserting
   // the vector into a larger undef one.
-  if (PartNumElts.isScalable())
+  if (PartNumElts.isScalable() && !ElementMismatch)
     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT),
                        Val, DAG.getVectorIdxConstant(0, DL));
 
+  // Mismatched element types: promote the elements first, then widen below.
+  if (ElementMismatch) {
+    // Bail on truncation (information loss).
+    if (PartVT.getVectorElementType().getScalarSizeInBits() <
+        ValueVT.getVectorElementType().getScalarSizeInBits()) {
+      return SDValue();
+    }
+
+    // Promote to the part element type, keeping the original element count.
+    EVT PromotedVT =
+        EVT::getVectorVT(*DAG.getContext(), PartVT.getVectorElementType(),
+                         ValueVT.getVectorNumElements());
+    Val = DAG.getAnyExtOrTrunc(Val, DL, PromotedVT);
+  }
+
   EVT ElementVT = PartVT.getVectorElementType();
   // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in
   // undef elements.
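For reference, the new hook composes with getTypeConversion() in TargetLoweringBase.cpp below: each query returns a single legalization step, and type legalization keeps re-querying until it reaches a legal type. The following is a minimal illustrative sketch, not part of this patch, assuming a hypothetical MyTargetLowering subclass with VE-like 256-element native vector registers; the authoritative in-tree user is the VE implementation further down in this change.

Optional<TargetLoweringBase::LegalizeKind>
MyTargetLowering::getCustomTypeConversion(LLVMContext &Context, EVT VT) const {
  // Leave everything except short fixed-width f16 vectors to the builtin
  // legalization scheme.
  if (!VT.isVector() || VT.isScalableVector())
    return None;

  EVT ElemVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  // First step: v7f16 -> v7f32. Promote the small element type (this mirrors
  // the element promotion the VE code below performs).
  if (ElemVT == MVT::f16)
    return LegalizeKind(LegalizeTypeAction::TypePromoteFloat,
                        EVT::getVectorVT(Context, MVT::f32, NumElems));

  // Second step (a later query): v7f32 -> v256f32. Widen straight to the
  // native register length instead of scalarizing.
  if (ElemVT == MVT::f32 && NumElems < 256)
    return LegalizeKind(LegalizeTypeAction::TypeWidenVector,
                        EVT::getVectorVT(Context, MVT::f32, 256));

  // Anything else keeps the builtin decision.
  return None;
}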
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -958,6 +958,11 @@
 TargetLoweringBase::LegalizeKind
 TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
+  // Fully customized legalization.
+  Optional<LegalizeKind> CustomLK = getCustomTypeConversion(Context, VT);
+  if (CustomLK)
+    return *CustomLK;
+
   // If this is a simple type, use the ComputeRegisterProp mechanism.
   if (VT.isSimple()) {
     MVT SVT = VT.getSimpleVT();
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -102,7 +102,41 @@
     return ISD::ANY_EXTEND;
   }
 
+  /// Custom CC Mapping {
+  using RegisterCountPair = std::pair<MVT, unsigned>;
+  // Map all vector EVTs to vector or vector mask registers.
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+                                    EVT VT) const override {
+    auto Opt = getRegistersForCallingConv(Context, CC, VT);
+    if (!Opt)
+      return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+    return Opt->first;
+  }
+
+  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                         CallingConv::ID CC,
+                                         EVT VT) const override {
+    auto Opt = getRegistersForCallingConv(Context, CC, VT);
+    if (!Opt)
+      return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+    return Opt->second;
+  }
+
+  Optional<RegisterCountPair> getRegistersForCallingConv(LLVMContext &Context,
+                                                         CallingConv::ID CC,
+                                                         EVT VT) const;
+
+  unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+                                                CallingConv::ID CC, EVT VT,
+                                                EVT &IntermediateVT,
+                                                unsigned &NumIntermediates,
+                                                MVT &RegisterVT) const override;
+  /// } Custom CC Mapping
+
   /// Custom Lower {
+
+  Optional<LegalizeKind> getCustomTypeConversion(LLVMContext &Context,
+                                                 EVT VT) const override;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   unsigned getJumpTableEncoding() const override;
   const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -2835,3 +2835,186 @@
   }
   return Result;
 }
+
+static bool isPackableElemVT(EVT VT) {
+  if (VT.isVector())
+    return false;
+  return VT.getScalarSizeInBits() <= 32;
+}
+
+static bool isVectorRegisterVT(EVT VT) {
+  if (!VT.isVector() || VT.isScalableVector())
+    return false;
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT ElemVT = VT.getVectorElementType();
+
+  // Not a legal element count.
+  if ((NumElems != 256) && (NumElems != 512))
+    return false;
+
+  // Legal as both regular and packed vectors.
+  if (ElemVT == MVT::i1 || ElemVT == MVT::i32 || ElemVT == MVT::f32)
+    return true;
+
+  // Only legal in regular mode.
+  return NumElems == 256;
+}
+
+static TargetLoweringBase::LegalizeKind
+getPromoteElementConversion(LLVMContext &Context, EVT ElemVT,
+                            unsigned NumElems) {
+  using LegalizeKind = TargetLoweringBase::LegalizeKind;
+  using LegalizeTypeAction = TargetLoweringBase::LegalizeTypeAction;
+
+  LegalizeTypeAction LTA;
+  MVT PromotedElemVT;
+  if (ElemVT.isFloatingPoint()) {
+    PromotedElemVT = MVT::f32;
+    LTA = LegalizeTypeAction::TypePromoteFloat;
+  } else {
+    assert(ElemVT.isInteger());
+    PromotedElemVT = MVT::i32;
+    LTA = LegalizeTypeAction::TypePromoteInteger;
+  }
+  return LegalizeKind(LTA, EVT::getVectorVT(Context, PromotedElemVT, NumElems));
+}
+
+static TargetLoweringBase::LegalizeKind
+getWidenVectorConversion(LLVMContext &Context, EVT ElemVT,
+                         unsigned LegalNumElems) {
+  using LegalizeKind = TargetLoweringBase::LegalizeKind;
+  using LegalizeTypeAction = TargetLoweringBase::LegalizeTypeAction;
+
+  return LegalizeKind(LegalizeTypeAction::TypeWidenVector,
+                      EVT::getVectorVT(Context, ElemVT, LegalNumElems));
+}
+
+static TargetLoweringBase::LegalizeKind
+getSplitVectorConversion(LLVMContext &Context, EVT ElemVT, unsigned NumElems) {
+  using LegalizeKind = TargetLoweringBase::LegalizeKind;
+  using LegalizeTypeAction = TargetLoweringBase::LegalizeTypeAction;
+
+  return LegalizeKind(LegalizeTypeAction::TypeSplitVector,
+                      EVT::getVectorVT(Context, ElemVT, (NumElems + 1) / 2));
+}
+
+Optional<TargetLoweringBase::LegalizeKind>
+VETargetLowering::getCustomTypeConversion(LLVMContext &Context, EVT VT) const {
+  // Do not interfere with SPU legalization.
+  if (!VT.isVector() || !Subtarget->enableVPU() ||
+      VT.getVectorNumElements() == 1)
+    return None;
+
+  // Already a legal type.
+  if (isVectorRegisterVT(VT))
+    return None;
+
+  // Promote small elements to i/f32.
+  EVT ElemVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  auto ElemBits = ElemVT.getScalarSizeInBits();
+  if (1 < ElemBits && ElemBits < 32)
+    return getPromoteElementConversion(Context, ElemVT, NumElems);
+
+  // Excessive element size.
+  if (ElemBits > 64)
+    return None; // Defer to builtin expansion for oversized vectors.
+
+  // Only use packed mode when surpassing the regular (256 elements) vector
+  // size.
+  const bool UsePackedRegister = isPackableElemVT(ElemVT) && NumElems > 256;
+
+  // Widen to register width.
+  const unsigned RegisterNumElems = UsePackedRegister ? 512 : 256;
+  if (NumElems < RegisterNumElems)
+    return getWidenVectorConversion(Context, ElemVT, RegisterNumElems);
+
+  // Split to register width.
+  // TODO: Teach isel to split non-power-of-two vectors.
+  if (NumElems > RegisterNumElems && (NumElems % 2 == 0))
+    return getSplitVectorConversion(Context, ElemVT, NumElems);
+
+  // Type is either legal or not custom converted.
+  return None;
+}
+
+Optional<VETargetLowering::RegisterCountPair>
+VETargetLowering::getRegistersForCallingConv(LLVMContext &Context,
+                                             CallingConv::ID CC, EVT VT) const {
+  using RegisterCount = VETargetLowering::RegisterCountPair;
+  if (CC != CallingConv::Fast)
+    return None;
+  if (!VT.isVector() || VT.isScalableVector())
+    return None;
+
+  MVT RegisterVT;
+  EVT IntermediateVT;
+  unsigned NumIntermediates;
+  unsigned NumRegs = getVectorTypeBreakdownForCallingConv(
+      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+  return RegisterCount{RegisterVT, NumRegs};
+}
+
+unsigned VETargetLowering::getVectorTypeBreakdownForCallingConv(
+    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+    unsigned &NumIntermediates, MVT &RegisterVT) const {
+  auto DefaultImpl = [&]() {
+    return TargetLoweringBase::getVectorTypeBreakdownForCallingConv(
+        Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+  };
+
+  if (CC != CallingConv::Fast || VT.isScalableVector() ||
+      isVectorRegisterVT(VT))
+    return DefaultImpl();
+
+  // fastcc - map everything to vregs.
+  auto LK = getCustomTypeConversion(Context, VT);
+  // Non-custom converted type - back to builtin logic.
+  if (!LK)
+    return DefaultImpl();
+
+  // Compute the fixed point of the custom type conversion rules.
+  // We want to have the same vector layout inside functions as well as across
+  // function boundaries.
+
+  // IntermediateVT: used to copy the parts.
+  IntermediateVT = VT;
+  NumIntermediates = 1;
+
+  EVT NextVT;
+  do {
+    NextVT = LK->second;
+    auto LTA = LK->first;
+
+    switch (LTA) {
+    default:
+      return DefaultImpl();
+
+    case LegalizeTypeAction::TypePromoteFloat:
+    case LegalizeTypeAction::TypePromoteInteger:
+      // Promote elements across call boundaries.
+      IntermediateVT = NextVT;
+      break;
+
+    case LegalizeTypeAction::TypeWidenVector:
+      // Retain all information about the original vector length.
+      // That is, keep the IntermediateVT at the original vector length if
+      // possible.
+      break;
+
+    case LegalizeTypeAction::TypeSplitVector:
+      // The last split results in the intermediate VT used for copying vectors
+      // at calls.
+      IntermediateVT = NextVT;
+      NumIntermediates *= 2;
+      break;
+    }
+
+    LK = getCustomTypeConversion(Context, NextVT);
+  } while (LK);
+
+  RegisterVT = NextVT.getSimpleVT();
+
+  // Must converge on a valid RegisterVT.
+  return NumIntermediates;
+}
diff --git a/llvm/test/CodeGen/VE/Vector/fastcc_callee.ll b/llvm/test/CodeGen/VE/Vector/fastcc_callee.ll
--- a/llvm/test/CodeGen/VE/Vector/fastcc_callee.ll
+++ b/llvm/test/CodeGen/VE/Vector/fastcc_callee.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
 
 ; Scalar argument passing must not change (same tests as in VE/Scalar/callee.ll below - this time with +vpu)
@@ -145,3 +146,322 @@
 ; CHECK-NEXT: b.l.t (, %s10)
   ret <256 x i1> %vm6
 }
+
+
+;;; Non-simple vector types.
+
+;; Expect non-power-of-two vectors that fit inside one vector register to be widened.
+define fastcc <17 x i64> @vreg_arg_v17i64_r1(<256 x i64> %p0, <17 x i64> %p1) { +; CHECK-LABEL: vreg_arg_v17i64_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: b.l.t (, %s10) + ret <17 x i64> %p1 +} + +define fastcc <17 x i32> @vreg_arg_v17i32_r1(<256 x i32> %p0, <17 x i32> %p1) { +; CHECK-LABEL: vreg_arg_v17i32_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: b.l.t (, %s10) + ret <17 x i32> %p1 +} + +define fastcc <17 x i1> @vm_arg_v17i1_r1(<256 x i1> %p0, <17 x i1> %p1) { +; CHECK-LABEL: vm_arg_v17i1_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: andm %vm1, %vm0, %vm2 +; CHECK-NEXT: b.l.t (, %s10) + ret <17 x i1> %p1 +} + +;; Expect over-sized non-power-of-two vectors to be split(64bit elements) and widened. +define fastcc <334 x i64> @vreg_arg_v334i64_r1(<256 x i64> %p0, <334 x i64> %p1) { +; CHECK-LABEL: vreg_arg_v334i64_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v1, (0)1, %v2 +; CHECK-NEXT: b.l.t (, %s10) + ret <334 x i64> %p1 +} + +define fastcc <334 x i32> @vreg_arg_v334i32_r1(<256 x i32> %p0, <334 x i32> %p1) { +; CHECK-LABEL: vreg_arg_v334i32_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: b.l.t (, %s10) + ret <334 x i32> %p1 +} + +; FIXME: This test documents a bug in cc lowering: +; %p1 should live in 'VMP3' and there should be a copy from that to 'VMP1' here. +define fastcc <334 x i1> @vm_arg_v334i1_r1(<256 x i1> %p0, <334 x i1> %p1) { +; CHECK-LABEL: vm_arg_v334i1_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret <334 x i1> %p1 +} + +;; Vectors with over-sized elements. +; TODO: Implement custom element splitting to get this into vregs. 
+define fastcc <17 x i128> @vreg_arg_v17i128_r1(<256 x i128> %p0, <17 x i128> %p1) { +; CHECK-LABEL: vreg_arg_v17i128_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: ld %s1, 4280(, %s11) +; CHECK-NEXT: ld %s2, 4288(, %s11) +; CHECK-NEXT: ld %s3, 4296(, %s11) +; CHECK-NEXT: ld %s4, 4304(, %s11) +; CHECK-NEXT: ld %s5, 4312(, %s11) +; CHECK-NEXT: ld %s6, 4320(, %s11) +; CHECK-NEXT: ld %s7, 4328(, %s11) +; CHECK-NEXT: ld %s34, 4336(, %s11) +; CHECK-NEXT: ld %s35, 4344(, %s11) +; CHECK-NEXT: ld %s36, 4352(, %s11) +; CHECK-NEXT: ld %s37, 4360(, %s11) +; CHECK-NEXT: ld %s38, 4368(, %s11) +; CHECK-NEXT: ld %s39, 4376(, %s11) +; CHECK-NEXT: ld %s40, 4384(, %s11) +; CHECK-NEXT: ld %s41, 4392(, %s11) +; CHECK-NEXT: ld %s42, 4400(, %s11) +; CHECK-NEXT: ld %s43, 4408(, %s11) +; CHECK-NEXT: ld %s44, 4416(, %s11) +; CHECK-NEXT: ld %s45, 4424(, %s11) +; CHECK-NEXT: ld %s46, 4432(, %s11) +; CHECK-NEXT: ld %s47, 4440(, %s11) +; CHECK-NEXT: ld %s48, 4448(, %s11) +; CHECK-NEXT: ld %s49, 4456(, %s11) +; CHECK-NEXT: ld %s50, 4464(, %s11) +; CHECK-NEXT: ld %s51, 4472(, %s11) +; CHECK-NEXT: ld %s52, 4480(, %s11) +; CHECK-NEXT: ld %s53, 4488(, %s11) +; CHECK-NEXT: ld %s54, 4496(, %s11) +; CHECK-NEXT: ld %s55, 4504(, %s11) +; CHECK-NEXT: ld %s56, 4512(, %s11) +; CHECK-NEXT: ld %s57, 4544(, %s11) +; CHECK-NEXT: ld %s58, 4536(, %s11) +; CHECK-NEXT: ld %s59, 4528(, %s11) +; CHECK-NEXT: ld %s60, 4520(, %s11) +; CHECK-NEXT: st %s57, 264(, %s0) +; CHECK-NEXT: st %s58, 256(, %s0) +; CHECK-NEXT: st %s59, 248(, %s0) +; CHECK-NEXT: st %s60, 240(, %s0) +; CHECK-NEXT: st %s56, 232(, %s0) +; CHECK-NEXT: st %s55, 224(, %s0) +; CHECK-NEXT: st %s54, 216(, %s0) +; CHECK-NEXT: st %s53, 208(, %s0) +; CHECK-NEXT: st %s52, 200(, %s0) +; CHECK-NEXT: st %s51, 192(, %s0) +; CHECK-NEXT: st %s50, 184(, %s0) +; CHECK-NEXT: st %s49, 176(, %s0) +; CHECK-NEXT: st %s48, 168(, %s0) +; CHECK-NEXT: st %s47, 160(, %s0) +; CHECK-NEXT: st %s46, 152(, %s0) +; CHECK-NEXT: st %s45, 144(, %s0) +; CHECK-NEXT: st %s44, 136(, %s0) +; CHECK-NEXT: st %s43, 128(, %s0) +; CHECK-NEXT: st %s42, 120(, %s0) +; CHECK-NEXT: st %s41, 112(, %s0) +; CHECK-NEXT: st %s40, 104(, %s0) +; CHECK-NEXT: st %s39, 96(, %s0) +; CHECK-NEXT: st %s38, 88(, %s0) +; CHECK-NEXT: st %s37, 80(, %s0) +; CHECK-NEXT: st %s36, 72(, %s0) +; CHECK-NEXT: st %s35, 64(, %s0) +; CHECK-NEXT: st %s34, 56(, %s0) +; CHECK-NEXT: st %s7, 48(, %s0) +; CHECK-NEXT: st %s6, 40(, %s0) +; CHECK-NEXT: st %s5, 32(, %s0) +; CHECK-NEXT: st %s4, 24(, %s0) +; CHECK-NEXT: st %s3, 16(, %s0) +; CHECK-NEXT: st %s2, 8(, %s0) +; CHECK-NEXT: st %s1, (, %s0) +; CHECK-NEXT: b.l.t (, %s10) + ret <17 x i128> %p1 +} + +; TODO: Implement custom element splitting to get this into vregs. 
+define fastcc <17 x i65> @vreg_arg_v17i65_r1(<256 x i65> %p0, <17 x i65> %p1) { +; CHECK-LABEL: vreg_arg_v17i65_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: ld %s2, 4304(, %s11) +; CHECK-NEXT: ld %s3, 4320(, %s11) +; CHECK-NEXT: ld %s1, 4312(, %s11) +; CHECK-NEXT: ld %s5, 4336(, %s11) +; CHECK-NEXT: ld %s4, 4328(, %s11) +; CHECK-NEXT: ld %s6, 4352(, %s11) +; CHECK-NEXT: ld %s7, 4344(, %s11) +; CHECK-NEXT: ld %s34, 4368(, %s11) +; CHECK-NEXT: ld %s35, 4360(, %s11) +; CHECK-NEXT: ld %s36, 4384(, %s11) +; CHECK-NEXT: ld %s37, 4376(, %s11) +; CHECK-NEXT: ld %s38, 4400(, %s11) +; CHECK-NEXT: ld %s39, 4392(, %s11) +; CHECK-NEXT: ld %s40, 4416(, %s11) +; CHECK-NEXT: ld %s41, 4408(, %s11) +; CHECK-NEXT: ld %s42, 4432(, %s11) +; CHECK-NEXT: ld %s43, 4424(, %s11) +; CHECK-NEXT: ld %s44, 4448(, %s11) +; CHECK-NEXT: ld %s45, 4440(, %s11) +; CHECK-NEXT: ld %s46, 4464(, %s11) +; CHECK-NEXT: ld %s47, 4456(, %s11) +; CHECK-NEXT: ld %s48, 4480(, %s11) +; CHECK-NEXT: ld %s49, 4472(, %s11) +; CHECK-NEXT: ld %s50, 4496(, %s11) +; CHECK-NEXT: ld %s51, 4488(, %s11) +; CHECK-NEXT: ld %s52, 4512(, %s11) +; CHECK-NEXT: ld %s53, 4504(, %s11) +; CHECK-NEXT: ld %s54, 4528(, %s11) +; CHECK-NEXT: ld %s55, 4520(, %s11) +; CHECK-NEXT: ld %s56, 4288(, %s11) +; CHECK-NEXT: ld %s57, 4280(, %s11) +; CHECK-NEXT: ld %s58, 4544(, %s11) +; CHECK-NEXT: ld %s59, 4296(, %s11) +; CHECK-NEXT: ld %s60, 4536(, %s11) +; CHECK-NEXT: st %s57, (, %s0) +; CHECK-NEXT: and %s57, 1, %s58 +; CHECK-NEXT: st1b %s57, 138(, %s0) +; CHECK-NEXT: srl %s57, %s60, 48 +; CHECK-NEXT: st2b %s57, 136(, %s0) +; CHECK-NEXT: sll %s57, %s59, 1 +; CHECK-NEXT: and %s56, 1, %s56 +; CHECK-NEXT: or %s56, %s56, %s57 +; CHECK-NEXT: st %s56, 8(, %s0) +; CHECK-NEXT: srl %s56, %s55, 49 +; CHECK-NEXT: and %s54, 1, %s54 +; CHECK-NEXT: sll %s54, %s54, 15 +; CHECK-NEXT: or %s54, %s56, %s54 +; CHECK-NEXT: sll %s56, %s60, 16 +; CHECK-NEXT: or %s54, %s54, %s56 +; CHECK-NEXT: st %s54, 128(, %s0) +; CHECK-NEXT: srl %s54, %s53, 50 +; CHECK-NEXT: and %s52, 1, %s52 +; CHECK-NEXT: sll %s52, %s52, 14 +; CHECK-NEXT: or %s52, %s52, %s54 +; CHECK-NEXT: sll %s54, %s55, 15 +; CHECK-NEXT: or %s52, %s52, %s54 +; CHECK-NEXT: st %s52, 120(, %s0) +; CHECK-NEXT: srl %s52, %s51, 51 +; CHECK-NEXT: and %s50, 1, %s50 +; CHECK-NEXT: sll %s50, %s50, 13 +; CHECK-NEXT: or %s50, %s52, %s50 +; CHECK-NEXT: sll %s52, %s53, 14 +; CHECK-NEXT: or %s50, %s50, %s52 +; CHECK-NEXT: st %s50, 112(, %s0) +; CHECK-NEXT: srl %s50, %s49, 52 +; CHECK-NEXT: and %s48, 1, %s48 +; CHECK-NEXT: sll %s48, %s48, 12 +; CHECK-NEXT: or %s48, %s48, %s50 +; CHECK-NEXT: sll %s50, %s51, 13 +; CHECK-NEXT: or %s48, %s48, %s50 +; CHECK-NEXT: st %s48, 104(, %s0) +; CHECK-NEXT: srl %s48, %s47, 53 +; CHECK-NEXT: and %s46, 1, %s46 +; CHECK-NEXT: sll %s46, %s46, 11 +; CHECK-NEXT: or %s46, %s48, %s46 +; CHECK-NEXT: sll %s48, %s49, 12 +; CHECK-NEXT: or %s46, %s46, %s48 +; CHECK-NEXT: st %s46, 96(, %s0) +; CHECK-NEXT: srl %s46, %s45, 54 +; CHECK-NEXT: and %s44, 1, %s44 +; CHECK-NEXT: sll %s44, %s44, 10 +; CHECK-NEXT: or %s44, %s44, %s46 +; CHECK-NEXT: sll %s46, %s47, 11 +; CHECK-NEXT: or %s44, %s44, %s46 +; CHECK-NEXT: st %s44, 88(, %s0) +; CHECK-NEXT: srl %s44, %s43, 55 +; CHECK-NEXT: and %s42, 1, %s42 +; CHECK-NEXT: sll %s42, %s42, 9 +; CHECK-NEXT: or %s42, %s44, %s42 +; CHECK-NEXT: sll %s44, %s45, 10 +; CHECK-NEXT: or %s42, %s42, %s44 +; CHECK-NEXT: st %s42, 80(, %s0) +; CHECK-NEXT: srl %s42, %s41, 56 +; CHECK-NEXT: and %s40, 1, %s40 +; CHECK-NEXT: sll %s40, %s40, 8 +; CHECK-NEXT: or %s40, %s40, %s42 +; CHECK-NEXT: sll %s42, %s43, 9 +; CHECK-NEXT: 
or %s40, %s40, %s42 +; CHECK-NEXT: st %s40, 72(, %s0) +; CHECK-NEXT: srl %s40, %s39, 57 +; CHECK-NEXT: and %s38, 1, %s38 +; CHECK-NEXT: sll %s38, %s38, 7 +; CHECK-NEXT: or %s38, %s40, %s38 +; CHECK-NEXT: sll %s40, %s41, 8 +; CHECK-NEXT: or %s38, %s38, %s40 +; CHECK-NEXT: st %s38, 64(, %s0) +; CHECK-NEXT: srl %s38, %s37, 58 +; CHECK-NEXT: and %s36, 1, %s36 +; CHECK-NEXT: sll %s36, %s36, 6 +; CHECK-NEXT: or %s36, %s36, %s38 +; CHECK-NEXT: sll %s38, %s39, 7 +; CHECK-NEXT: or %s36, %s36, %s38 +; CHECK-NEXT: st %s36, 56(, %s0) +; CHECK-NEXT: srl %s36, %s35, 59 +; CHECK-NEXT: and %s34, 1, %s34 +; CHECK-NEXT: sll %s34, %s34, 5 +; CHECK-NEXT: or %s34, %s36, %s34 +; CHECK-NEXT: sll %s36, %s37, 6 +; CHECK-NEXT: or %s34, %s34, %s36 +; CHECK-NEXT: st %s34, 48(, %s0) +; CHECK-NEXT: srl %s34, %s7, 60 +; CHECK-NEXT: and %s6, 1, %s6 +; CHECK-NEXT: sll %s6, %s6, 4 +; CHECK-NEXT: or %s6, %s6, %s34 +; CHECK-NEXT: sll %s34, %s35, 5 +; CHECK-NEXT: or %s6, %s6, %s34 +; CHECK-NEXT: st %s6, 40(, %s0) +; CHECK-NEXT: srl %s6, %s4, 61 +; CHECK-NEXT: and %s5, 1, %s5 +; CHECK-NEXT: sll %s5, %s5, 3 +; CHECK-NEXT: or %s5, %s6, %s5 +; CHECK-NEXT: sll %s6, %s7, 4 +; CHECK-NEXT: or %s5, %s5, %s6 +; CHECK-NEXT: st %s5, 32(, %s0) +; CHECK-NEXT: srl %s5, %s1, 62 +; CHECK-NEXT: and %s3, 1, %s3 +; CHECK-NEXT: sll %s3, %s3, 2 +; CHECK-NEXT: or %s3, %s3, %s5 +; CHECK-NEXT: sll %s4, %s4, 3 +; CHECK-NEXT: or %s3, %s3, %s4 +; CHECK-NEXT: st %s3, 24(, %s0) +; CHECK-NEXT: srl %s3, %s59, 63 +; CHECK-NEXT: and %s2, 1, %s2 +; CHECK-NEXT: sll %s2, %s2, 1 +; CHECK-NEXT: or %s2, %s3, %s2 +; CHECK-NEXT: sll %s1, %s1, 2 +; CHECK-NEXT: or %s1, %s2, %s1 +; CHECK-NEXT: st %s1, 16(, %s0) +; CHECK-NEXT: b.l.t (, %s10) + ret <17 x i65> %p1 +} + +;; Vectors with under-sized elements. +define fastcc <17 x i16> @vreg_arg_v17i16_r1(<256 x i16> %p0, <17 x i16> %p1) { +; CHECK-LABEL: vreg_arg_v17i16_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: b.l.t (, %s10) + ret <17 x i16> %p1 +} + +define fastcc <17 x i13> @vreg_arg_v17i13_r1(<256 x i13> %p0, <17 x i13> %p1) { +; CHECK-LABEL: vreg_arg_v17i13_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: b.l.t (, %s10) + ret <17 x i13> %p1 +} diff --git a/llvm/test/CodeGen/VE/Vector/fastcc_caller.ll b/llvm/test/CodeGen/VE/Vector/fastcc_caller.ll --- a/llvm/test/CodeGen/VE/Vector/fastcc_caller.ll +++ b/llvm/test/CodeGen/VE/Vector/fastcc_caller.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s @@ -11,7 +12,22 @@ define fastcc i32 @sample_call() { ; CHECK-LABEL: sample_call: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: lea %s0, sample_add@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, sample_add@hi(, %s0) @@ -19,13 +35,31 @@ ; CHECK-NEXT: or %s1, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) 
+; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %r = tail call fastcc i32 @sample_add(i32 1, i32 2) ret i32 %r } define fastcc i32 @stack_call_int() { ; CHECK-LABEL: stack_call_int: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -256(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB1_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: or %s0, 10, (0)1 ; CHECK-NEXT: st %s0, 248(, %s11) ; CHECK-NEXT: or %s34, 9, (0)1 @@ -43,13 +77,31 @@ ; CHECK-NEXT: st %s34, 240(, %s11) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %r = tail call fastcc i32 @stack_callee_int(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10) ret i32 %r } define fastcc i32 @stack_call_int_szext() { ; CHECK-LABEL: stack_call_int_szext: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -256(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB2_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: or %s0, -1, (0)1 ; CHECK-NEXT: st %s0, 248(, %s11) ; CHECK-NEXT: lea %s34, 65535 @@ -67,13 +119,31 @@ ; CHECK-NEXT: st %s34, 240(, %s11) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %r = tail call fastcc i32 @stack_callee_int_szext(i1 -1, i8 -1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i16 -1, i8 -1) ret i32 %r } define fastcc float @stack_call_float() { ; CHECK-LABEL: stack_call_float: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -256(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB3_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: lea.sl %s0, 1092616192 ; CHECK-NEXT: st %s0, 248(, %s11) ; CHECK-NEXT: lea.sl %s34, 1091567616 @@ -91,13 +161,31 @@ ; CHECK-NEXT: st %s34, 240(, %s11) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %r = tail call fastcc float @stack_callee_float(float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0) ret float %r } define fastcc float @stack_call_float2(float %p0) { ; CHECK-LABEL: stack_call_float2: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea 
%s11, -256(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: st %s0, 248(, %s11) ; CHECK-NEXT: lea %s1, stack_callee_float@lo ; CHECK-NEXT: and %s1, %s1, (32)0 @@ -112,32 +200,68 @@ ; CHECK-NEXT: or %s7, 0, %s0 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %r = tail call fastcc float @stack_callee_float(float %p0, float %p0, float %p0, float %p0, float %p0, float %p0, float %p0, float %p0, float %p0, float %p0) ret float %r } ; Vector argument passing (fastcc feature) -; +; declare fastcc <256 x i32> @get_v256i32() declare fastcc void @vsample_v(<256 x i32>) declare fastcc void @vsample_iv(i32, <256 x i32>) define void @caller_vret() { -; CHECK: caller_vret: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-LABEL: caller_vret: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB5_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: lea %s0, get_v256i32@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, get_v256i32@hi(, %s0) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %r = tail call fastcc <256 x i32> @get_v256i32() ret void } define void @caller_vret_pass_p0() { ; CHECK-LABEL: caller_vret_pass_p0: -; CHECK: .LBB{{[0-9]+}}_2: -; CHECK: lea %s0, get_v256i32@lo +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB6_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: lea %s0, get_v256i32@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, get_v256i32@hi(, %s0) ; CHECK-NEXT: bsic %s10, (, %s12) @@ -146,6 +270,9 @@ ; CHECK-NEXT: lea.sl %s12, vsample_v@hi(, %s0) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %p = tail call fastcc <256 x i32> @get_v256i32() call fastcc void @vsample_v(<256 x i32> %p) ret void @@ -153,8 +280,24 @@ define void @caller_vret_pass_p1(i32 %s) { ; CHECK-LABEL: caller_vret_pass_p1: -; CHECK: .LBB{{[0-9]+}}_2: -; CHECK: or %s18, 0, %s0 +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB7_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; 
CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: st %s18, 288(, %s11) # 8-byte Folded Spill +; CHECK-NEXT: or %s18, 0, %s0 ; CHECK-NEXT: lea %s0, get_v256i32@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, get_v256i32@hi(, %s0) @@ -164,6 +307,11 @@ ; CHECK-NEXT: lea.sl %s12, vsample_iv@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s18 ; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: ld %s18, 288(, %s11) # 8-byte Folded Reload +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %p = tail call fastcc <256 x i32> @get_v256i32() call fastcc void @vsample_iv(i32 %s, <256 x i32> %p) ret void @@ -174,7 +322,22 @@ define void @caller_vret_pass_p01() { ; CHECK-LABEL: caller_vret_pass_p01: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB8_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB8_2: ; CHECK-NEXT: lea %s0, get_v256i32@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, get_v256i32@hi(, %s0) @@ -187,6 +350,9 @@ ; CHECK-NEXT: vor %v1, (0)1, %v0 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %p = tail call fastcc <256 x i32> @get_v256i32() call fastcc void @vsample_vv(<256 x i32> %p, <256 x i32> %p) ret void @@ -194,7 +360,22 @@ define void @caller_vret_pass_p012() { ; CHECK-LABEL: caller_vret_pass_p012: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB9_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB9_2: ; CHECK-NEXT: lea %s0, get_v256i32@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, get_v256i32@hi(, %s0) @@ -210,6 +391,9 @@ ; CHECK-NEXT: vor %v2, (0)1, %v0 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) %p = tail call fastcc <256 x i32> @get_v256i32() call fastcc void @vsample_vvv(<256 x i32> %p, <256 x i32> %p, <256 x i32> %p) ret void @@ -221,14 +405,29 @@ ; TODO improve vreg copy (redundant lea+lvl emitted) define fastcc void @roundtrip_caller_callee(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6) { ; CHECK-LABEL: roundtrip_caller_callee: -; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: 
ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB10_2: ; CHECK-NEXT: lea %s16, 256 ; CHECK-NEXT: lvl %s16 ; CHECK-NEXT: vor %v7, (0)1, %v0 ; CHECK-NEXT: lea %s0, vsample_vvvvvvv@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, vsample_vvvvvvv@hi(, %s0) -; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lea %s16, 256 ; CHECK-NEXT: lvl %s16 ; CHECK-NEXT: vor %v0, (0)1, %v1 ; CHECK-NEXT: lea %s16, 256 @@ -251,6 +450,466 @@ ; CHECK-NEXT: vor %v6, (0)1, %v7 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) call fastcc void @vsample_vvvvvvv(<256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6, <256 x i32> %p0) ret void } + + +;;; Non-simple vector types. + +declare fastcc void @vsample_v17i64(<17 x i64>) + +;; Expect non-power-of-two vector that fit inside one vector register to be widened. +define fastcc void @vreg_arg_v17i64_r1(<256 x i64> %p0, <17 x i64> %p1) { +; CHECK-LABEL: vreg_arg_v17i64_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: lea %s0, vsample_v17i64@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v17i64@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v17i64(<17 x i64> %p1) + ret void +} + +declare fastcc void @vsample_v17i32(<17 x i32>) + +;; Expect non-power-of-two vector that fit inside one vector register to be widened. +define fastcc void @vreg_arg_v17i32_r1(<256 x i32> %p0, <17 x i32> %p1) { +; CHECK-LABEL: vreg_arg_v17i32_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB12_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: lea %s0, vsample_v17i32@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v17i32@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v17i32(<17 x i32> %p1) + ret void +} + +declare fastcc void @vsample_v17i1(<17 x i1>) + +;; Expect non-power-of-two vector that fit inside one vector register to be widened. 
+define fastcc void @vreg_arg_v17i1_r1(<256 x i1> %p0, <17 x i1> %p1) { +; CHECK-LABEL: vreg_arg_v17i1_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: andm %vm1, %vm0, %vm2 +; CHECK-NEXT: lea %s0, vsample_v17i1@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v17i1@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v17i1(<17 x i1> %p1) + ret void +} + +;; Expect over-sized non-power-of-two vectors to be split(64bit elements) and widened. +declare fastcc void @vsample_v334i64(<334 x i64>) + +define fastcc void @vreg_arg_v334i64_r1(<256 x i64> %p0, <334 x i64> %p1) { +; CHECK-LABEL: vreg_arg_v334i64_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB14_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: lea %s0, vsample_v334i64@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v334i64@hi(, %s0) +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v1, (0)1, %v2 +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v334i64(<334 x i64> %p1) + ret void +} + +declare fastcc void @vsample_v334i32(<334 x i32>) + +define fastcc void @vreg_arg_v334i32_r1(<256 x i32> %p0, <334 x i32> %p1) { +; CHECK-LABEL: vreg_arg_v334i32_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB15_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: lea %s0, vsample_v334i32@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v334i32@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v334i32(<334 x i32> %p1) + ret void +} + +declare fastcc void @vsample_v334i1(<334 x i1>) + +; FIXME: This test documents a bug in cc lowering: +; %p1 should live in 'VMP3' and there should be a 
copy from that to 'VMP1' here. +define fastcc void @vreg_arg_v334i1_r1(<256 x i1> %p0, <334 x i1> %p1) { +; CHECK-LABEL: vreg_arg_v334i1_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB16_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB16_2: +; CHECK-NEXT: lea %s0, vsample_v334i1@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v334i1@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v334i1(<334 x i1> %p1) + ret void +} + +; TODO: Implement custom element splitting to get this into vregs. +declare fastcc void @vsample_v17i128(<17 x i128>) + +define fastcc void @vreg_arg_v17i128_r1(<256 x i64> %p0, <17 x i128> %p1) { +; CHECK-LABEL: vreg_arg_v17i128_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -448(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB17_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB17_2: +; CHECK-NEXT: ld %s34, 688(, %s11) +; CHECK-NEXT: ld %s35, 696(, %s11) +; CHECK-NEXT: ld %s36, 704(, %s11) +; CHECK-NEXT: ld %s37, 712(, %s11) +; CHECK-NEXT: ld %s38, 720(, %s11) +; CHECK-NEXT: ld %s39, 728(, %s11) +; CHECK-NEXT: ld %s40, 736(, %s11) +; CHECK-NEXT: ld %s41, 744(, %s11) +; CHECK-NEXT: ld %s42, 752(, %s11) +; CHECK-NEXT: ld %s43, 760(, %s11) +; CHECK-NEXT: ld %s44, 768(, %s11) +; CHECK-NEXT: ld %s45, 776(, %s11) +; CHECK-NEXT: ld %s46, 784(, %s11) +; CHECK-NEXT: ld %s47, 792(, %s11) +; CHECK-NEXT: ld %s48, 800(, %s11) +; CHECK-NEXT: ld %s49, 808(, %s11) +; CHECK-NEXT: ld %s50, 816(, %s11) +; CHECK-NEXT: ld %s51, 824(, %s11) +; CHECK-NEXT: ld %s52, 832(, %s11) +; CHECK-NEXT: ld %s53, 840(, %s11) +; CHECK-NEXT: ld %s54, 848(, %s11) +; CHECK-NEXT: ld %s55, 856(, %s11) +; CHECK-NEXT: ld %s56, 864(, %s11) +; CHECK-NEXT: ld %s57, 872(, %s11) +; CHECK-NEXT: ld %s58, 880(, %s11) +; CHECK-NEXT: ld %s59, 888(, %s11) +; CHECK-NEXT: st %s59, 440(, %s11) +; CHECK-NEXT: st %s58, 432(, %s11) +; CHECK-NEXT: st %s57, 424(, %s11) +; CHECK-NEXT: st %s56, 416(, %s11) +; CHECK-NEXT: st %s55, 408(, %s11) +; CHECK-NEXT: st %s54, 400(, %s11) +; CHECK-NEXT: st %s53, 392(, %s11) +; CHECK-NEXT: st %s52, 384(, %s11) +; CHECK-NEXT: st %s51, 376(, %s11) +; CHECK-NEXT: st %s50, 368(, %s11) +; CHECK-NEXT: st %s49, 360(, %s11) +; CHECK-NEXT: st %s48, 352(, %s11) +; CHECK-NEXT: st %s47, 344(, %s11) +; CHECK-NEXT: st %s46, 336(, %s11) +; CHECK-NEXT: st %s45, 328(, %s11) +; CHECK-NEXT: st %s44, 320(, %s11) +; CHECK-NEXT: st %s43, 312(, %s11) +; CHECK-NEXT: st %s42, 304(, %s11) +; CHECK-NEXT: st %s41, 296(, %s11) +; CHECK-NEXT: st %s40, 288(, %s11) +; CHECK-NEXT: st %s39, 280(, %s11) +; CHECK-NEXT: st %s38, 272(, %s11) +; CHECK-NEXT: st %s37, 264(, %s11) +; CHECK-NEXT: st %s36, 256(, %s11) +; CHECK-NEXT: st %s35, 
248(, %s11) +; CHECK-NEXT: lea %s35, vsample_v17i128@lo +; CHECK-NEXT: and %s35, %s35, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v17i128@hi(, %s35) +; CHECK-NEXT: st %s34, 240(, %s11) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v17i128(<17 x i128> %p1) + ret void +} + +declare fastcc void @vsample_v17i65(<17 x i65>) + +define fastcc void @vreg_arg_v17i65_r1(<256 x i64> %p0, <17 x i65> %p1) { +; CHECK-LABEL: vreg_arg_v17i65_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -448(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB18_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: ld %s34, 688(, %s11) +; CHECK-NEXT: ld %s35, 696(, %s11) +; CHECK-NEXT: ld %s36, 704(, %s11) +; CHECK-NEXT: ld %s37, 712(, %s11) +; CHECK-NEXT: ld %s38, 720(, %s11) +; CHECK-NEXT: ld %s39, 728(, %s11) +; CHECK-NEXT: ld %s40, 736(, %s11) +; CHECK-NEXT: ld %s41, 744(, %s11) +; CHECK-NEXT: ld %s42, 752(, %s11) +; CHECK-NEXT: ld %s43, 760(, %s11) +; CHECK-NEXT: ld %s44, 768(, %s11) +; CHECK-NEXT: ld %s45, 776(, %s11) +; CHECK-NEXT: ld %s46, 784(, %s11) +; CHECK-NEXT: ld %s47, 792(, %s11) +; CHECK-NEXT: ld %s48, 800(, %s11) +; CHECK-NEXT: ld %s49, 808(, %s11) +; CHECK-NEXT: ld %s50, 816(, %s11) +; CHECK-NEXT: ld %s51, 824(, %s11) +; CHECK-NEXT: ld %s52, 832(, %s11) +; CHECK-NEXT: ld %s53, 840(, %s11) +; CHECK-NEXT: ld %s54, 848(, %s11) +; CHECK-NEXT: ld %s55, 856(, %s11) +; CHECK-NEXT: ld %s56, 864(, %s11) +; CHECK-NEXT: ld %s57, 872(, %s11) +; CHECK-NEXT: ld %s58, 880(, %s11) +; CHECK-NEXT: ld %s59, 888(, %s11) +; CHECK-NEXT: st %s59, 440(, %s11) +; CHECK-NEXT: st %s58, 432(, %s11) +; CHECK-NEXT: st %s57, 424(, %s11) +; CHECK-NEXT: st %s56, 416(, %s11) +; CHECK-NEXT: st %s55, 408(, %s11) +; CHECK-NEXT: st %s54, 400(, %s11) +; CHECK-NEXT: st %s53, 392(, %s11) +; CHECK-NEXT: st %s52, 384(, %s11) +; CHECK-NEXT: st %s51, 376(, %s11) +; CHECK-NEXT: st %s50, 368(, %s11) +; CHECK-NEXT: st %s49, 360(, %s11) +; CHECK-NEXT: st %s48, 352(, %s11) +; CHECK-NEXT: st %s47, 344(, %s11) +; CHECK-NEXT: st %s46, 336(, %s11) +; CHECK-NEXT: st %s45, 328(, %s11) +; CHECK-NEXT: st %s44, 320(, %s11) +; CHECK-NEXT: st %s43, 312(, %s11) +; CHECK-NEXT: st %s42, 304(, %s11) +; CHECK-NEXT: st %s41, 296(, %s11) +; CHECK-NEXT: st %s40, 288(, %s11) +; CHECK-NEXT: st %s39, 280(, %s11) +; CHECK-NEXT: st %s38, 272(, %s11) +; CHECK-NEXT: st %s37, 264(, %s11) +; CHECK-NEXT: st %s36, 256(, %s11) +; CHECK-NEXT: st %s35, 248(, %s11) +; CHECK-NEXT: lea %s35, vsample_v17i65@lo +; CHECK-NEXT: and %s35, %s35, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v17i65@hi(, %s35) +; CHECK-NEXT: st %s34, 240(, %s11) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v17i65(<17 x i65> %p1) + ret void +} + +;; Vectors with under-sized elements. 
+declare fastcc void @vsample_v17i16(<17 x i16>) + +define fastcc void @vreg_arg_v17i16_r1(<256 x i16> %p0, <17 x i16> %p1) { +; CHECK-LABEL: vreg_arg_v17i16_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB19_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB19_2: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: lea %s0, vsample_v17i16@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v17i16@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v17i16(<17 x i16> %p1) + ret void +} + +declare fastcc void @vsample_v17i13(<17 x i13>) + +define fastcc void @vreg_arg_v17i13_r1(<256 x i13> %p0, <17 x i13> %p1) { +; CHECK-LABEL: vreg_arg_v17i13_r1: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB20_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB20_2: +; CHECK-NEXT: lea %s16, 256 +; CHECK-NEXT: lvl %s16 +; CHECK-NEXT: vor %v0, (0)1, %v1 +; CHECK-NEXT: lea %s0, vsample_v17i13@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, vsample_v17i13@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + call fastcc void @vsample_v17i13(<17 x i13> %p1) + ret void +} diff --git a/llvm/test/CodeGen/VE/Vector/vec_add.ll b/llvm/test/CodeGen/VE/Vector/vec_add.ll --- a/llvm/test/CodeGen/VE/Vector/vec_add.ll +++ b/llvm/test/CodeGen/VE/Vector/vec_add.ll @@ -124,7 +124,14 @@ ; Function Attrs: nounwind define fastcc <128 x i16> @add_vv_v128i16(<128 x i16> %x, <128 x i16> %y) { ; CHECK-LABEL: add_vv_v128i16: -; CHECK-NOT: vadd +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lea %s1, 65535 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: pvand.lo %v1, %s1, %v1 +; CHECK-NEXT: pvand.lo %v0, %s1, %v0 +; CHECK-NEXT: vadds.w.sx %v0, %v0, %v1 +; CHECK-NEXT: b.l.t (, %s10) %z = add <128 x i16> %x, %y ret <128 x i16> %z } diff --git a/llvm/test/CodeGen/VE/Vector/vec_and.ll b/llvm/test/CodeGen/VE/Vector/vec_and.ll --- a/llvm/test/CodeGen/VE/Vector/vec_and.ll +++ b/llvm/test/CodeGen/VE/Vector/vec_and.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve -mattr=+vpu | FileCheck %s ; <256 x i32> @@ -125,7 +124,13 @@ ; Function Attrs: nounwind define fastcc <128 x i16> @and_vv_v128i16(<128 x i16> %x, <128 x i16> %y) { ; CHECK-LABEL: and_vv_v128i16: -; CHECK-NOT: vand +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 256 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: pvand.lo %v0, %v1, %v0 +; CHECK-NEXT: lea %s1, 65535 +; CHECK-NEXT: pvand.lo %v0, %s1, 
%v0 +; CHECK-NEXT: b.l.t (, %s10) %z = and <128 x i16> %x, %y ret <128 x i16> %z } diff --git a/llvm/test/CodeGen/VE/Vector/vec_broadcast.ll b/llvm/test/CodeGen/VE/Vector/vec_broadcast.ll --- a/llvm/test/CodeGen/VE/Vector/vec_broadcast.ll +++ b/llvm/test/CodeGen/VE/Vector/vec_broadcast.ll @@ -192,135 +192,10 @@ define fastcc <128 x i16> @brd_v128i16(i16 %s) { ; CHECK-LABEL: brd_v128i16: ; CHECK: # %bb.0: -; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: st2b %s1, 254(, %s0) -; CHECK-NEXT: st2b %s1, 252(, %s0) -; CHECK-NEXT: st2b %s1, 250(, %s0) -; CHECK-NEXT: st2b %s1, 248(, %s0) -; CHECK-NEXT: st2b %s1, 246(, %s0) -; CHECK-NEXT: st2b %s1, 244(, %s0) -; CHECK-NEXT: st2b %s1, 242(, %s0) -; CHECK-NEXT: st2b %s1, 240(, %s0) -; CHECK-NEXT: st2b %s1, 238(, %s0) -; CHECK-NEXT: st2b %s1, 236(, %s0) -; CHECK-NEXT: st2b %s1, 234(, %s0) -; CHECK-NEXT: st2b %s1, 232(, %s0) -; CHECK-NEXT: st2b %s1, 230(, %s0) -; CHECK-NEXT: st2b %s1, 228(, %s0) -; CHECK-NEXT: st2b %s1, 226(, %s0) -; CHECK-NEXT: st2b %s1, 224(, %s0) -; CHECK-NEXT: st2b %s1, 222(, %s0) -; CHECK-NEXT: st2b %s1, 220(, %s0) -; CHECK-NEXT: st2b %s1, 218(, %s0) -; CHECK-NEXT: st2b %s1, 216(, %s0) -; CHECK-NEXT: st2b %s1, 214(, %s0) -; CHECK-NEXT: st2b %s1, 212(, %s0) -; CHECK-NEXT: st2b %s1, 210(, %s0) -; CHECK-NEXT: st2b %s1, 208(, %s0) -; CHECK-NEXT: st2b %s1, 206(, %s0) -; CHECK-NEXT: st2b %s1, 204(, %s0) -; CHECK-NEXT: st2b %s1, 202(, %s0) -; CHECK-NEXT: st2b %s1, 200(, %s0) -; CHECK-NEXT: st2b %s1, 198(, %s0) -; CHECK-NEXT: st2b %s1, 196(, %s0) -; CHECK-NEXT: st2b %s1, 194(, %s0) -; CHECK-NEXT: st2b %s1, 192(, %s0) -; CHECK-NEXT: st2b %s1, 190(, %s0) -; CHECK-NEXT: st2b %s1, 188(, %s0) -; CHECK-NEXT: st2b %s1, 186(, %s0) -; CHECK-NEXT: st2b %s1, 184(, %s0) -; CHECK-NEXT: st2b %s1, 182(, %s0) -; CHECK-NEXT: st2b %s1, 180(, %s0) -; CHECK-NEXT: st2b %s1, 178(, %s0) -; CHECK-NEXT: st2b %s1, 176(, %s0) -; CHECK-NEXT: st2b %s1, 174(, %s0) -; CHECK-NEXT: st2b %s1, 172(, %s0) -; CHECK-NEXT: st2b %s1, 170(, %s0) -; CHECK-NEXT: st2b %s1, 168(, %s0) -; CHECK-NEXT: st2b %s1, 166(, %s0) -; CHECK-NEXT: st2b %s1, 164(, %s0) -; CHECK-NEXT: st2b %s1, 162(, %s0) -; CHECK-NEXT: st2b %s1, 160(, %s0) -; CHECK-NEXT: st2b %s1, 158(, %s0) -; CHECK-NEXT: st2b %s1, 156(, %s0) -; CHECK-NEXT: st2b %s1, 154(, %s0) -; CHECK-NEXT: st2b %s1, 152(, %s0) -; CHECK-NEXT: st2b %s1, 150(, %s0) -; CHECK-NEXT: st2b %s1, 148(, %s0) -; CHECK-NEXT: st2b %s1, 146(, %s0) -; CHECK-NEXT: st2b %s1, 144(, %s0) -; CHECK-NEXT: st2b %s1, 142(, %s0) -; CHECK-NEXT: st2b %s1, 140(, %s0) -; CHECK-NEXT: st2b %s1, 138(, %s0) -; CHECK-NEXT: st2b %s1, 136(, %s0) -; CHECK-NEXT: st2b %s1, 134(, %s0) -; CHECK-NEXT: st2b %s1, 132(, %s0) -; CHECK-NEXT: st2b %s1, 130(, %s0) -; CHECK-NEXT: st2b %s1, 128(, %s0) -; CHECK-NEXT: st2b %s1, 126(, %s0) -; CHECK-NEXT: st2b %s1, 124(, %s0) -; CHECK-NEXT: st2b %s1, 122(, %s0) -; CHECK-NEXT: st2b %s1, 120(, %s0) -; CHECK-NEXT: st2b %s1, 118(, %s0) -; CHECK-NEXT: st2b %s1, 116(, %s0) -; CHECK-NEXT: st2b %s1, 114(, %s0) -; CHECK-NEXT: st2b %s1, 112(, %s0) -; CHECK-NEXT: st2b %s1, 110(, %s0) -; CHECK-NEXT: st2b %s1, 108(, %s0) -; CHECK-NEXT: st2b %s1, 106(, %s0) -; CHECK-NEXT: st2b %s1, 104(, %s0) -; CHECK-NEXT: st2b %s1, 102(, %s0) -; CHECK-NEXT: st2b %s1, 100(, %s0) -; CHECK-NEXT: st2b %s1, 98(, %s0) -; CHECK-NEXT: st2b %s1, 96(, %s0) -; CHECK-NEXT: st2b %s1, 94(, %s0) -; CHECK-NEXT: st2b %s1, 92(, %s0) -; CHECK-NEXT: st2b %s1, 90(, %s0) -; CHECK-NEXT: st2b %s1, 88(, %s0) -; CHECK-NEXT: st2b %s1, 86(, %s0) -; CHECK-NEXT: st2b %s1, 84(, %s0) -; CHECK-NEXT: st2b 
%s1, 82(, %s0) -; CHECK-NEXT: st2b %s1, 80(, %s0) -; CHECK-NEXT: st2b %s1, 78(, %s0) -; CHECK-NEXT: st2b %s1, 76(, %s0) -; CHECK-NEXT: st2b %s1, 74(, %s0) -; CHECK-NEXT: st2b %s1, 72(, %s0) -; CHECK-NEXT: st2b %s1, 70(, %s0) -; CHECK-NEXT: st2b %s1, 68(, %s0) -; CHECK-NEXT: st2b %s1, 66(, %s0) -; CHECK-NEXT: st2b %s1, 64(, %s0) -; CHECK-NEXT: st2b %s1, 62(, %s0) -; CHECK-NEXT: st2b %s1, 60(, %s0) -; CHECK-NEXT: st2b %s1, 58(, %s0) -; CHECK-NEXT: st2b %s1, 56(, %s0) -; CHECK-NEXT: st2b %s1, 54(, %s0) -; CHECK-NEXT: st2b %s1, 52(, %s0) -; CHECK-NEXT: st2b %s1, 50(, %s0) -; CHECK-NEXT: st2b %s1, 48(, %s0) -; CHECK-NEXT: st2b %s1, 46(, %s0) -; CHECK-NEXT: st2b %s1, 44(, %s0) -; CHECK-NEXT: st2b %s1, 42(, %s0) -; CHECK-NEXT: st2b %s1, 40(, %s0) -; CHECK-NEXT: st2b %s1, 38(, %s0) -; CHECK-NEXT: st2b %s1, 36(, %s0) -; CHECK-NEXT: st2b %s1, 34(, %s0) -; CHECK-NEXT: st2b %s1, 32(, %s0) -; CHECK-NEXT: st2b %s1, 30(, %s0) -; CHECK-NEXT: st2b %s1, 28(, %s0) -; CHECK-NEXT: st2b %s1, 26(, %s0) -; CHECK-NEXT: st2b %s1, 24(, %s0) -; CHECK-NEXT: st2b %s1, 22(, %s0) -; CHECK-NEXT: st2b %s1, 20(, %s0) -; CHECK-NEXT: st2b %s1, 18(, %s0) -; CHECK-NEXT: st2b %s1, 16(, %s0) -; CHECK-NEXT: st2b %s1, 14(, %s0) -; CHECK-NEXT: st2b %s1, 12(, %s0) -; CHECK-NEXT: st2b %s1, 10(, %s0) -; CHECK-NEXT: st2b %s1, 8(, %s0) -; CHECK-NEXT: st2b %s1, 6(, %s0) -; CHECK-NEXT: st2b %s1, 4(, %s0) -; CHECK-NEXT: st2b %s1, 2(, %s0) -; CHECK-NEXT: st2b %s1, (, %s0) +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea %s1, 256 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vbrd %v0, %s0 ; CHECK-NEXT: b.l.t (, %s10) %val = insertelement <128 x i16> undef, i16 %s, i32 0 %ret = shufflevector <128 x i16> %val, <128 x i16> undef, <128 x i32> zeroinitializer