diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -123,6 +123,8 @@
   SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   /// } Custom Lower
 
   /// Replace the results of node with an illegal result
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -73,6 +73,8 @@
 static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
                                    MVT::v256f32, MVT::v512f32, MVT::v256f64};
 
+static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
+
 void VETargetLowering::initRegisterClasses() {
   // Set up the register classes.
   addRegisterClass(MVT::i32, &VE::I32RegClass);
@@ -288,6 +290,14 @@
 #define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
   setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
 #include "VVPNodes.def"
+
+    setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
+  }
+
+  for (MVT LegalPackedVT : AllPackedVTs) {
+    setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
   }
 }
 
@@ -1534,6 +1544,29 @@
                      MachinePointerInfo());
 }
 
+static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
+  if (!isa<BuildVectorSDNode>(N))
+    return false;
+  const auto *BVN = cast<BuildVectorSDNode>(N);
+
+  // Find first non-undef insertion.
+  unsigned Idx;
+  for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
+    auto ElemV = BVN->getOperand(Idx);
+    if (!ElemV->isUndef())
+      break;
+  }
+  // Remember insertion.
+  UniqueIdx = Idx++;
+  // Verify that all other insertions are undef.
+  for (; Idx < BVN->getNumOperands(); ++Idx) {
+    auto ElemV = BVN->getOperand(Idx);
+    if (!ElemV->isUndef())
+      return false;
+  }
+  return true;
+}
+
 static SDValue getSplatValue(SDNode *N) {
   if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
     return BuildVec->getSplatValue();
@@ -1547,6 +1580,17 @@
   unsigned NumEls = Op.getValueType().getVectorNumElements();
   MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
 
+  // If there is just one element, expand to INSERT_VECTOR_ELT.
+  unsigned UniqueIdx;
+  if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
+    SDValue AccuV = DAG.getUNDEF(Op.getValueType());
+    auto ElemV = Op->getOperand(UniqueIdx);
+    SDValue IdxV = DAG.getConstant(UniqueIdx, DL, MVT::i64);
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV,
+                       ElemV, IdxV);
+  }
+
+  // Else emit a broadcast.
   if (SDValue ScalarV = getSplatValue(Op.getNode())) {
     // lower to VEC_BROADCAST
    MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
@@ -1556,8 +1600,20 @@
                        AVL);
   }
 
-  // Expand
-  return SDValue();
+  // Manually expand to a cascade of insertelts.
+  // FIXME: Else, isel will expand this to vector mem ops, which aren't
+  // implemented yet.
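+  // For illustration: a BUILD_VECTOR whose only defined elements are at
+  // indices 0 and 2 becomes two INSERT_VECTOR_ELT nodes chained onto an
+  // UNDEF accumulator; undef elements are skipped entirely.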
+  SDValue AccuV = DAG.getUNDEF(Op.getValueType());
+  const auto *BVN = cast<BuildVectorSDNode>(Op);
+  for (unsigned Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
+    auto ElemV = BVN->getOperand(Idx);
+    if (ElemV->isUndef())
+      continue;
+    SDValue IdxV = DAG.getConstant(Idx, DL, MVT::i64);
+    AccuV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV,
+                        ElemV, IdxV);
+  }
+  return AccuV;
 }
 
 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -1595,6 +1651,11 @@
   case ISD::VAARG:
     return lowerVAARG(Op, DAG);
 
+  case ISD::INSERT_VECTOR_ELT:
+    return lowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+
 #define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
 #include "VVPNodes.def"
     return lowerToVVP(Op, DAG);
@@ -1923,3 +1984,112 @@
   }
   llvm_unreachable("lowerToVVP called for unexpected SDNode.");
 }
+
+SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+  MVT VT = Op.getOperand(0).getSimpleValueType();
+
+  // Special treatment for packed V64 types.
+  assert(VT == MVT::v512i32 || VT == MVT::v512f32);
+  // Example of codes:
+  //   %packed_v = extractelt %vr, %idx / 2
+  //   %v = %packed_v >> (%idx % 2 * 32)
+  //   %res = %v & 0xffffffff
+
+  SDValue Vec = Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  SDLoc DL(Op);
+  SDValue Result = Op;
+  if (0 /* Idx->isConstant() */) {
+    // TODO: optimized implementation using constant values
+  } else {
+    SDValue SetEq = DAG.getCondCode(ISD::SETEQ);
+    SDValue ZeroConst = DAG.getConstant(0, DL, MVT::i64);
+    SDValue OneConst = DAG.getConstant(1, DL, MVT::i64);
+    SDValue ThirtyTwoConst = DAG.getConstant(32, DL, MVT::i64);
+    SDValue LowBits = DAG.getConstant(0xFFFFFFFF, DL, MVT::i64);
+    SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, OneConst});
+    SDValue PackedVal =
+        SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
+    SDValue IdxLSB = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, OneConst});
+    SDValue ShiftIdx =
+        DAG.getNode(ISD::SELECT_CC, DL, MVT::i64,
+                    {IdxLSB, ZeroConst, ZeroConst, ThirtyTwoConst, SetEq});
+    SDValue ShiftedVal =
+        DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedVal, ShiftIdx});
+    SDValue MaskedVal =
+        DAG.getNode(ISD::AND, DL, MVT::i64, {ShiftedVal, LowBits});
+    // In v512i32 and v512f32, both i32 and f32 values are placed in i32 subreg.
+    SDValue SubLow32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
+    Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+                                        MVT::i32, MaskedVal, SubLow32),
+                     0);
+    if (VT == MVT::v512f32) {
+      Result = DAG.getBitcast(MVT::f32, Result);
+    }
+  }
+  return Result;
+}
+
+SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+  MVT VT = Op.getOperand(0).getSimpleValueType();
+
+  // Special treatment for packed V64 types.
+  assert(VT == MVT::v512i32 || VT == MVT::v512f32);
+  // Example of codes:
+  //   %packed_v = extractelt %vr, %idx / 2
+  //   %packed_v &= 0xffffffff << ((%idx % 2) ? 0 : 32)
+  //   %packed_v |= %val << (%idx % 2 * 32)
+  //   %vr = insertelt %vr, %packed_v, %idx / 2
+
+  SDValue Vec = Op.getOperand(0);
+  SDValue Val = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  SDLoc DL(Op);
+  // In v512i32 and v512f32, both i32 and f32 values are placed from Low32,
+  // therefore convert f32 to i32 first.
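+  // For illustration: with %idx = 372 (even), the 64-bit packed element is
+  // read with LVS from %vr(186), the mask keeps its upper 32 bits, and the
+  // new value is merged into the lower 32 bits before LSV writes it back.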
+ SDValue I32Val = Val; + if (VT == MVT::v512f32) { + I32Val = DAG.getBitcast(MVT::i32, Val); + } + SDValue Result = Op; + if (0 /* Idx->isConstant()*/) { + // TODO: optimized implementation using constant values + } else { + SDValue SetEq = DAG.getCondCode(ISD::SETEQ); + // SDValue CcEq = DAG.getConstant(VECC::CC_IEQ, DL, i64); + SDValue ZeroConst = DAG.getConstant(0, DL, MVT::i64); + SDValue OneConst = DAG.getConstant(1, DL, MVT::i64); + SDValue ThirtyTwoConst = DAG.getConstant(32, DL, MVT::i64); + SDValue HighMask = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64); + SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, OneConst}); + SDValue PackedVal = + SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0); + SDValue IdxLSB = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, OneConst}); + SDValue ShiftIdx = + DAG.getNode(ISD::SELECT_CC, DL, MVT::i64, + {IdxLSB, ZeroConst, ZeroConst, ThirtyTwoConst, SetEq}); + SDValue Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {HighMask, ShiftIdx}); + SDValue MaskedVal = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedVal, Mask}); + SDValue BaseVal = SDValue( + DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0); + // In v512i32 and v512f32, Both i32 and f32 values are placed in i32 subreg. + SDValue SubLow32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32); + SDValue I64Val = + SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i64, + BaseVal, I32Val, SubLow32), + 0); + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, DL, MVT::i64, {I64Val, ShiftIdx}); + SDValue CombinedVal = + DAG.getNode(ISD::OR, DL, MVT::i64, {ShiftedVal, MaskedVal}); + Result = + SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(), + {HalfIdx, CombinedVal, Vec}), + 0); + } + return Result; +} diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -1558,6 +1558,17 @@ // Pattern Matchings //===----------------------------------------------------------------------===// +// Basic cast between registers. This is often used in ISel patterns, so make +// them as OutPatFrag. +def i2l : OutPatFrag<(ops node:$exp), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $exp, sub_i32)>; +def l2i : OutPatFrag<(ops node:$exp), + (EXTRACT_SUBREG $exp, sub_i32)>; +def f2l : OutPatFrag<(ops node:$exp), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $exp, sub_f32)>; +def l2f : OutPatFrag<(ops node:$exp), + (EXTRACT_SUBREG $exp, sub_f32)>; + // Small immediates. 
def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>; def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>; @@ -1771,9 +1782,6 @@ defm : ATMLDm; defm : ATMLDm; -def i2l : OutPatFrag<(ops node:$exp), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $exp, sub_i32)>; - // Optimized atomic loads with sext multiclass SXATMLDm { +multiclass vbrd_elem32 { // VBRDil def : Pat<(v32 (vec_broadcast (s32 ImmOp:$sy), i32:$vl)), (VBRDil (ImmCast $sy), i32:$vl)>; @@ -23,13 +23,10 @@ // VBRDrl def : Pat<(v32 (vec_broadcast s32:$sy, i32:$vl)), (VBRDrl - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $sy, SubRegIdx), + (SuperRegCast $sy), i32:$vl)>; } -defm : vbrd_elem32; -defm : vbrd_elem32; - multiclass vbrd_elem64 { // VBRDil def : Pat<(v64 (vec_broadcast (s64 ImmOp:$sy), i32:$vl)), @@ -40,5 +37,50 @@ (VBRDrl s64:$sy, i32:$vl)>; } -defm : vbrd_elem64; -defm : vbrd_elem64; +multiclass extract_insert_elem32 { + // LVSvi + def: Pat<(s32 (extractelt v32:$vec, uimm7:$idx)), + (SubRegCast (LVSvi v32:$vec, (ULO7 $idx)))>; + // LVSvr + def: Pat<(s32 (extractelt v32:$vec, i64:$idx)), + (SubRegCast (LVSvr v32:$vec, $idx))>; + + // LSVir + def: Pat<(v32 (insertelt v32:$vec, s32:$val, uimm7:$idx)), + (LSVir_v (ULO7 $idx), (SuperRegCast $val), $vec)>; + // LSVrr + def: Pat<(v32 (insertelt v32:$vec, s32:$val, i64:$idx)), + (LSVrr_v $idx, (SuperRegCast $val), $vec)>; +} + +multiclass extract_insert_elem64 { + // LVSvi + def: Pat<(s64 (extractelt v64:$vec, uimm7:$idx)), + (LVSvi v64:$vec, (ULO7 $idx))>; + // LVSvr + def: Pat<(s64 (extractelt v64:$vec, i64:$idx)), + (LVSvr v64:$vec, $idx)>; + + // LSVir + def: Pat<(v64 (insertelt v64:$vec, s64:$val, uimm7:$idx)), + (LSVir_v (ULO7 $idx), $val, $vec)>; + // LSVrr + def: Pat<(v64 (insertelt v64:$vec, s64:$val, i64:$idx)), + (LSVrr_v $idx, $val, $vec)>; +} + +multiclass patterns_elem32 { + defm : vbrd_elem32; + defm: extract_insert_elem32; +} + +multiclass patterns_elem64 { + defm : vbrd_elem64; + defm : extract_insert_elem64; +} + +defm : patterns_elem32; +defm : patterns_elem32; + +defm : patterns_elem64; +defm : patterns_elem64; diff --git a/llvm/test/CodeGen/VE/Vector/extract_elt.ll b/llvm/test/CodeGen/VE/Vector/extract_elt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Vector/extract_elt.ll @@ -0,0 +1,167 @@ +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + + +;;; <256 x i64> + +define fastcc i64 @extract_rr_v256i64(i32 %idx, <256 x i64> %v) { +; CHECK-LABEL: extract_rr_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i64> %v, i32 %idx + ret i64 %ret +} + +define fastcc i64 @extract_ri7_v256i64(<256 x i64> %v) { +; CHECK-LABEL: extract_ri7_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v0(127) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i64> %v, i32 127 + ret i64 %ret +} + +define fastcc i64 @extract_ri8_v256i64(<256 x i64> %v) { +; CHECK-LABEL: extract_ri8_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 128 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i64> %v, i32 128 + ret i64 %ret +} + +define fastcc i64 @extract_ri_v512i64(<512 x i64> %v) { +; CHECK-LABEL: extract_ri_v512i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v1(116) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x i64> %v, i32 372 + ret i64 %ret +} + +;;; <256 x i32> + +define fastcc i32 @extract_rr_v256i32(i32 %idx, <256 x i32> %v) { +; CHECK-LABEL: 
extract_rr_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i32> %v, i32 %idx + ret i32 %ret +} + +define fastcc i32 @extract_ri7_v256i32(<256 x i32> %v) { +; CHECK-LABEL: extract_ri7_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v0(127) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i32> %v, i32 127 + ret i32 %ret +} + +define fastcc i32 @extract_ri8_v256i32(<256 x i32> %v) { +; CHECK-LABEL: extract_ri8_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 128 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i32> %v, i32 128 + ret i32 %ret +} + +define fastcc i32 @extract_ri_v512i32(<512 x i32> %v) { +; CHECK-LABEL: extract_ri_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 186 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x i32> %v, i32 372 + ret i32 %ret +} + +;;; <256 x double> + +define fastcc double @extract_rr_v256f64(i32 %idx, <256 x double> %v) { +; CHECK-LABEL: extract_rr_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x double> %v, i32 %idx + ret double %ret +} + +define fastcc double @extract_ri7_v256f64(<256 x double> %v) { +; CHECK-LABEL: extract_ri7_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v0(127) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x double> %v, i32 127 + ret double %ret +} + +define fastcc double @extract_ri8_v256f64(<256 x double> %v) { +; CHECK-LABEL: extract_ri8_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 128 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x double> %v, i32 128 + ret double %ret +} + +define fastcc double @extract_ri_v512f64(<512 x double> %v) { +; CHECK-LABEL: extract_ri_v512f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v1(116) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x double> %v, i32 372 + ret double %ret +} + +;;; <256 x float> + +define fastcc float @extract_rr_v256f32(i32 %idx, <256 x float> %v) { +; CHECK-LABEL: extract_rr_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x float> %v, i32 %idx + ret float %ret +} + +define fastcc float @extract_ri7_v256f32(<256 x float> %v) { +; CHECK-LABEL: extract_ri7_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v0(127) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x float> %v, i32 127 + ret float %ret +} + +define fastcc float @extract_ri8_v256f32(<256 x float> %v) { +; CHECK-LABEL: extract_ri8_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 128 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x float> %v, i32 128 + ret float %ret +} + +define fastcc float @extract_ri_v512f32(<512 x float> %v) { +; CHECK-LABEL: extract_ri_v512f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 186 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: sll %s0, %s0, 32 +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x float> %v, i32 372 + ret float %ret +} diff --git a/llvm/test/CodeGen/VE/Vector/insert_elt.ll b/llvm/test/CodeGen/VE/Vector/insert_elt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Vector/insert_elt.ll @@ -0,0 
+1,175 @@ +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + + +;;; <256 x i64> + +define fastcc <256 x i64> @insert_rr_v256i64(i32 %idx, i64 %s) { +; CHECK-LABEL: insert_rr_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i64> undef, i64 %s, i32 %idx + ret <256 x i64> %ret +} + +define fastcc <256 x i64> @insert_ri7_v256i64(i64 %s) { +; CHECK-LABEL: insert_ri7_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v0(127), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i64> undef, i64 %s, i32 127 + ret <256 x i64> %ret +} + +define fastcc <256 x i64> @insert_ri8_v256i64(i64 %s) { +; CHECK-LABEL: insert_ri8_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 128 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i64> undef, i64 %s, i32 128 + ret <256 x i64> %ret +} + +define fastcc <512 x i64> @insert_ri_v512i64(i64 %s) { +; CHECK-LABEL: insert_ri_v512i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v1(116), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <512 x i64> undef, i64 %s, i32 372 + ret <512 x i64> %ret +} + +;;; <256 x i32> + +define fastcc <256 x i32> @insert_rr_v256i32(i32 %idx, i32 %s) { +; CHECK-LABEL: insert_rr_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i32> undef, i32 %s, i32 %idx + ret <256 x i32> %ret +} + +define fastcc <256 x i32> @insert_ri7_v256i32(i32 %s) { +; CHECK-LABEL: insert_ri7_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lsv %v0(127), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i32> undef, i32 %s, i32 127 + ret <256 x i32> %ret +} + +define fastcc <256 x i32> @insert_ri8_v256i32(i32 %s) { +; CHECK-LABEL: insert_ri8_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea %s1, 128 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i32> undef, i32 %s, i32 128 + ret <256 x i32> %ret +} + +define fastcc <512 x i32> @insert_ri_v512i32(i32 %s) { +; CHECK-LABEL: insert_ri_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea %s1, 186 +; CHECK-NEXT: lvs %s2, %v0(%s1) +; CHECK-NEXT: and %s2, %s2, (32)1 +; CHECK-NEXT: or %s0, %s0, %s2 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <512 x i32> undef, i32 %s, i32 372 + ret <512 x i32> %ret +} + +;;; <256 x double> + +define fastcc <256 x double> @insert_rr_v256f64(i32 %idx, double %s) { +; CHECK-LABEL: insert_rr_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x double> undef, double %s, i32 %idx + ret <256 x double> %ret +} + +define fastcc <256 x double> @insert_ri7_v256f64(double %s) { +; CHECK-LABEL: insert_ri7_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v0(127), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x double> undef, double %s, i32 127 + ret <256 x double> %ret +} + +define fastcc <256 x double> @insert_ri8_v256f64(double %s) { +; CHECK-LABEL: insert_ri8_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 128 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x double> undef, double %s, i32 128 + ret <256 x double> %ret +} 
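+
+; Note: the <512 x float> case at the end of this file goes through the custom
+; packed lowering: lvs reads the 64-bit packed element, the 32-bit lane is
+; merged into it, and lsv writes the combined value back.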
+ +define fastcc <512 x double> @insert_ri_v512f64(double %s) { +; CHECK-LABEL: insert_ri_v512f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v1(116), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <512 x double> undef, double %s, i32 372 + ret <512 x double> %ret +} + +;;; <256 x float> + +define fastcc <256 x float> @insert_rr_v256f32(i32 %idx, float %s) { +; CHECK-LABEL: insert_rr_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x float> undef, float %s, i32 %idx + ret <256 x float> %ret +} + +define fastcc <256 x float> @insert_ri7_v256f32(float %s) { +; CHECK-LABEL: insert_ri7_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v0(127), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x float> undef, float %s, i32 127 + ret <256 x float> %ret +} + +define fastcc <256 x float> @insert_ri8_v256f32(float %s) { +; CHECK-LABEL: insert_ri8_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 128 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x float> undef, float %s, i32 128 + ret <256 x float> %ret +} + +define fastcc <512 x float> @insert_ri_v512f32(float %s) { +; CHECK-LABEL: insert_ri_v512f32: +; CHECK: # %bb.0: +; CHECK-NEXT: sra.l %s0, %s0, 32 +; CHECK-NEXT: lea %s1, 186 +; CHECK-NEXT: lvs %s2, %v0(%s1) +; CHECK-NEXT: and %s2, %s2, (32)1 +; CHECK-NEXT: or %s0, %s0, %s2 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <512 x float> undef, float %s, i32 372 + ret <512 x float> %ret +}