diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -123,6 +123,8 @@
   SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   /// } Custom Lower
 
   /// Replace the results of node with an illegal result
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -73,6 +73,8 @@
 static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
                                    MVT::v256f32, MVT::v512f32, MVT::v256f64};
 
+static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
+
 void VETargetLowering::initRegisterClasses() {
   // Set up the register classes.
   addRegisterClass(MVT::i32, &VE::I32RegClass);
@@ -288,6 +290,14 @@
 #define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
   setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
 #include "VVPNodes.def"
+
+    setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
+  }
+
+  for (MVT LegalPackedVT : AllPackedVTs) {
+    setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
   }
 }
 
@@ -1620,6 +1630,11 @@
   case ISD::VAARG:
     return lowerVAARG(Op, DAG);
 
+  case ISD::INSERT_VECTOR_ELT:
+    return lowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+
 #define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
 #include "VVPNodes.def"
     return lowerToVVP(Op, DAG);
@@ -1948,3 +1963,123 @@
   }
   llvm_unreachable("lowerToVVP called for unexpected SDNode.");
 }
+
+/// Custom-lower EXTRACT_VECTOR_ELT for the packed types v512i32 and v512f32.
+///
+/// The 64-bit element at index (Idx >> 1) is read with LVSvr, then the
+/// requested 32-bit half is shifted down and masked.  Even indices select
+/// the upper 32 bits, odd indices the lower 32 bits.  An f32 result is
+/// produced by bitcasting the extracted i32.
+SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+  MVT VT = Op.getOperand(0).getSimpleValueType();
+
+  // Special treatment for packed V64 types.
+  assert(VT == MVT::v512i32 || VT == MVT::v512f32);
+  // VT is only read by the assert; keep builds without asserts warning-free.
+  (void)VT;
+  // Example of codes:
+  //   %packed_elt = extractelt %vr, (%idx >> 1)
+  //   %shift = ((%idx & 1) ^ 1) << 5
+  //   %v = %packed_elt >> %shift
+  //   %res = %v & 0xffffffff
+
+  SDValue Vec = Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  SDLoc DL(Op);
+  // The index arithmetic below is done in i64, as in lowerINSERT_VECTOR_ELT.
+  if (Idx.getValueType() == MVT::i32)
+    Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
+  SDValue Result = Op;
+  if (false /* Idx->isConstant() */) {
+    // TODO: optimized implementation using constant values
+  } else {
+    SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
+    SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
+    SDValue PackedElt =
+        SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
+    // Shift is 32 for even indices (upper half) and 0 for odd indices.
+    SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
+    SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
+    SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
+    Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
+    PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
+    SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
+    PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
+    SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
+    Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+                                        MVT::i32, PackedElt, SubI32),
+                     0);
+
+    if (Op.getValueType() == MVT::f32) {
+      Result = DAG.getBitcast(MVT::f32, Result);
+    } else {
+      assert(Op.getValueType() == MVT::i32);
+    }
+  }
+  return Result;
+}
+
+/// Custom-lower INSERT_VECTOR_ELT for the packed types v512i32 and v512f32.
+///
+/// The 64-bit element at index (Idx >> 1) is read with LVSvr, the addressed
+/// 32-bit half is cleared and replaced by the zero-extended value, and the
+/// merged element is written back with LSVrr_v.
+SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+  MVT VT = Op.getOperand(0).getSimpleValueType();
+
+  // Special treatment for packed V64 types.
+  assert(VT == MVT::v512i32 || VT == MVT::v512f32);
+  // VT is only read by the assert; keep builds without asserts warning-free.
+  (void)VT;
+  // The elements of v512i32 and v512f32 start from the upper bits (0..31).
+  // These "upper bits" require `val << 32` from the C implementation's point
+  // of view.
+  //
+  // Example of codes:
+  //   %packed_elt = extractelt %vr, (%idx >> 1)
+  //   %shift = ((%idx & 1) ^ 1) << 5
+  //   %packed_elt &= 0xffffffff00000000 >> shift
+  //   %packed_elt |= (zext %val) << shift
+  //   %vr = insertelt %vr, %packed_elt, (%idx >> 1)
+
+  SDLoc DL(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Val = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  if (Idx.getValueType() == MVT::i32)
+    Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
+  if (Val.getValueType() == MVT::f32)
+    Val = DAG.getBitcast(MVT::i32, Val);
+  assert(Val.getValueType() == MVT::i32);
+  Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
+
+  SDValue Result = Op;
+  if (false /* Idx->isConstant()*/) {
+    // TODO: optimized implementation using constant values
+  } else {
+    SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
+    SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
+    SDValue PackedElt =
+        SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
+    SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
+    SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
+    SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
+    Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
+    // Keep the half that is not being written: lower half for even indices,
+    // upper half for odd indices.
+    SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
+    Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
+    PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
+    Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
+    PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
+    Result =
+        SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
+                                   {HalfIdx, PackedElt, Vec}),
+                0);
+  }
+  return Result;
+}
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td
b/llvm/lib/Target/VE/VEInstrInfo.td --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -1558,6 +1558,17 @@ // Pattern Matchings //===----------------------------------------------------------------------===// +// Basic casts between registers. These are often used in ISel patterns, so +// define them as OutPatFrags. +def i2l : OutPatFrag<(ops node:$exp), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $exp, sub_i32)>; +def l2i : OutPatFrag<(ops node:$exp), + (EXTRACT_SUBREG $exp, sub_i32)>; +def f2l : OutPatFrag<(ops node:$exp), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $exp, sub_f32)>; +def l2f : OutPatFrag<(ops node:$exp), + (EXTRACT_SUBREG $exp, sub_f32)>; + // Small immediates. def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>; def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>; @@ -1771,9 +1782,6 @@ defm : ATMLDm; defm : ATMLDm; -def i2l : OutPatFrag<(ops node:$exp), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $exp, sub_i32)>; - // Optimized atomic loads with sext multiclass SXATMLDm { +multiclass vbrd_elem32 { // VBRDil def : Pat<(v32 (vec_broadcast (s32 ImmOp:$sy), i32:$vl)), (VBRDil (ImmCast $sy), i32:$vl)>; @@ -23,13 +23,10 @@ // VBRDrl def : Pat<(v32 (vec_broadcast s32:$sy, i32:$vl)), (VBRDrl - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $sy, SubRegIdx), + (SuperRegCast $sy), i32:$vl)>; } -defm : vbrd_elem32; -defm : vbrd_elem32; - multiclass vbrd_elem64 { // VBRDil def : Pat<(v64 (vec_broadcast (s64 ImmOp:$sy), i32:$vl)), @@ -40,5 +37,50 @@ (VBRDrl s64:$sy, i32:$vl)>; } -defm : vbrd_elem64; -defm : vbrd_elem64; +multiclass extract_insert_elem32 { + // LVSvi: extract with a uimm7 immediate index + def: Pat<(s32 (extractelt v32:$vec, uimm7:$idx)), + (SubRegCast (LVSvi v32:$vec, (ULO7 $idx)))>; + // LVSvr: extract with an i64 index + def: Pat<(s32 (extractelt v32:$vec, i64:$idx)), + (SubRegCast (LVSvr v32:$vec, $idx))>; + + // LSVir: insert with a uimm7 immediate index + def: Pat<(v32 (insertelt v32:$vec, s32:$val, uimm7:$idx)), + (LSVir_v (ULO7 $idx), (SuperRegCast $val), $vec)>; + // LSVrr: insert with an i64 index + def: Pat<(v32 (insertelt v32:$vec, 
s32:$val, i64:$idx)), + (LSVrr_v $idx, (SuperRegCast $val), $vec)>; +} + +multiclass extract_insert_elem64 { + // LVSvi: extract with a uimm7 immediate index + def: Pat<(s64 (extractelt v64:$vec, uimm7:$idx)), + (LVSvi v64:$vec, (ULO7 $idx))>; + // LVSvr: extract with an i64 index + def: Pat<(s64 (extractelt v64:$vec, i64:$idx)), + (LVSvr v64:$vec, $idx)>; + + // LSVir: insert with a uimm7 immediate index + def: Pat<(v64 (insertelt v64:$vec, s64:$val, uimm7:$idx)), + (LSVir_v (ULO7 $idx), $val, $vec)>; + // LSVrr: insert with an i64 index + def: Pat<(v64 (insertelt v64:$vec, s64:$val, i64:$idx)), + (LSVrr_v $idx, $val, $vec)>; +} + +multiclass patterns_elem32 { + defm : vbrd_elem32; + defm: extract_insert_elem32; +} + +multiclass patterns_elem64 { + defm : vbrd_elem64; + defm : extract_insert_elem64; +} + +defm : patterns_elem32; +defm : patterns_elem32; + +defm : patterns_elem64; +defm : patterns_elem64; diff --git a/llvm/test/CodeGen/VE/Vector/extract_elt.ll b/llvm/test/CodeGen/VE/Vector/extract_elt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Vector/extract_elt.ll @@ -0,0 +1,198 @@ +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + + +;;; <256 x i64> + +define fastcc i64 @extract_rr_v256i64(i32 %idx, <256 x i64> %v) { +; CHECK-LABEL: extract_rr_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i64> %v, i32 %idx + ret i64 %ret +} + +define fastcc i64 @extract_ri7_v256i64(<256 x i64> %v) { +; CHECK-LABEL: extract_ri7_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v0(127) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i64> %v, i32 127 + ret i64 %ret +} + +define fastcc i64 @extract_ri8_v256i64(<256 x i64> %v) { +; CHECK-LABEL: extract_ri8_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 128 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i64> %v, i32 128 + ret i64 %ret +} + +define fastcc i64 @extract_ri_v512i64(<512 x i64> %v) { +; CHECK-LABEL: extract_ri_v512i64: +; 
CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v1(116) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x i64> %v, i32 372 + ret i64 %ret +} + +;;; <256 x i32> + +define fastcc i32 @extract_rr_v256i32(i32 %idx, <256 x i32> %v) { +; CHECK-LABEL: extract_rr_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i32> %v, i32 %idx + ret i32 %ret +} + +define fastcc i32 @extract_ri7_v256i32(<256 x i32> %v) { +; CHECK-LABEL: extract_ri7_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v0(127) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i32> %v, i32 127 + ret i32 %ret +} + +define fastcc i32 @extract_ri8_v256i32(<256 x i32> %v) { +; CHECK-LABEL: extract_ri8_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 128 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x i32> %v, i32 128 + ret i32 %ret +} + +define fastcc i32 @extract_ri_v512i32(<512 x i32> %v) { +; CHECK-LABEL: extract_ri_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 186 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: srl %s0, %s0, 32 +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x i32> %v, i32 372 + ret i32 %ret +} + +define fastcc i32 @extract_rr_v512i32(<512 x i32> %v, i32 %idx) { +; CHECK-LABEL: extract_rr_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvs %s1, %v0(%s1) +; CHECK-NEXT: nnd %s0, %s0, (63)0 +; CHECK-NEXT: sla.w.sx %s0, %s0, 5 +; CHECK-NEXT: srl %s0, %s1, %s0 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x i32> %v, i32 %idx + ret i32 %ret +} + +;;; <256 x double> + +define fastcc double @extract_rr_v256f64(i32 %idx, <256 x double> %v) { +; CHECK-LABEL: extract_rr_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + 
%ret = extractelement <256 x double> %v, i32 %idx + ret double %ret +} + +define fastcc double @extract_ri7_v256f64(<256 x double> %v) { +; CHECK-LABEL: extract_ri7_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v0(127) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x double> %v, i32 127 + ret double %ret +} + +define fastcc double @extract_ri8_v256f64(<256 x double> %v) { +; CHECK-LABEL: extract_ri8_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 128 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x double> %v, i32 128 + ret double %ret +} + +define fastcc double @extract_ri_v512f64(<512 x double> %v) { +; CHECK-LABEL: extract_ri_v512f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v1(116) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x double> %v, i32 372 + ret double %ret +} + +;;; <256 x float> + +define fastcc float @extract_rr_v256f32(i32 %idx, <256 x float> %v) { +; CHECK-LABEL: extract_rr_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x float> %v, i32 %idx + ret float %ret +} + +define fastcc float @extract_ri7_v256f32(<256 x float> %v) { +; CHECK-LABEL: extract_ri7_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lvs %s0, %v0(127) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x float> %v, i32 127 + ret float %ret +} + +define fastcc float @extract_ri8_v256f32(<256 x float> %v) { +; CHECK-LABEL: extract_ri8_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 128 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <256 x float> %v, i32 128 + ret float %ret +} + +define fastcc float @extract_ri_v512f32(<512 x float> %v) { +; CHECK-LABEL: extract_ri_v512f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 186 +; CHECK-NEXT: lvs %s0, %v0(%s0) +; CHECK-NEXT: srl %s0, %s0, 32 +; CHECK-NEXT: sll %s0, %s0, 32 +; CHECK-NEXT: b.l.t (, %s10) + 
%ret = extractelement <512 x float> %v, i32 372 + ret float %ret +} + +define fastcc float @extract_rr_v512f32(<512 x float> %v, i32 %idx) { +; CHECK-LABEL: extract_rr_v512f32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvs %s1, %v0(%s1) +; CHECK-NEXT: nnd %s0, %s0, (63)0 +; CHECK-NEXT: sla.w.sx %s0, %s0, 5 +; CHECK-NEXT: srl %s0, %s1, %s0 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: sll %s0, %s0, 32 +; CHECK-NEXT: b.l.t (, %s10) + %ret = extractelement <512 x float> %v, i32 %idx + ret float %ret +} diff --git a/llvm/test/CodeGen/VE/Vector/insert_elt.ll b/llvm/test/CodeGen/VE/Vector/insert_elt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/VE/Vector/insert_elt.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s + + +;;; <256 x i64> + +define fastcc <256 x i64> @insert_rr_v256i64(i32 %idx, i64 %s) { +; CHECK-LABEL: insert_rr_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i64> undef, i64 %s, i32 %idx + ret <256 x i64> %ret +} + +define fastcc <256 x i64> @insert_ri7_v256i64(i64 %s) { +; CHECK-LABEL: insert_ri7_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v0(127), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i64> undef, i64 %s, i32 127 + ret <256 x i64> %ret +} + +define fastcc <256 x i64> @insert_ri8_v256i64(i64 %s) { +; CHECK-LABEL: insert_ri8_v256i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 128 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i64> undef, i64 %s, i32 128 + ret <256 x i64> %ret +} + +define fastcc <512 x i64> @insert_ri_v512i64(i64 %s) { +; CHECK-LABEL: insert_ri_v512i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v1(116), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = 
insertelement <512 x i64> undef, i64 %s, i32 372 + ret <512 x i64> %ret +} + +;;; <256 x i32> + +define fastcc <256 x i32> @insert_rr_v256i32(i32 %idx, i32 %s) { +; CHECK-LABEL: insert_rr_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i32> undef, i32 %s, i32 %idx + ret <256 x i32> %ret +} + +define fastcc <256 x i32> @insert_ri7_v256i32(i32 %s) { +; CHECK-LABEL: insert_ri7_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lsv %v0(127), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i32> undef, i32 %s, i32 127 + ret <256 x i32> %ret +} + +define fastcc <256 x i32> @insert_ri8_v256i32(i32 %s) { +; CHECK-LABEL: insert_ri8_v256i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea %s1, 128 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x i32> undef, i32 %s, i32 128 + ret <256 x i32> %ret +} + +define fastcc <512 x i32> @insert_ri_v512i32(i32 %s) { +; CHECK-LABEL: insert_ri_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 186 +; CHECK-NEXT: lvs %s2, %v0(%s1) +; CHECK-NEXT: and %s2, %s2, (32)0 +; CHECK-NEXT: sll %s0, %s0, 32 +; CHECK-NEXT: or %s0, %s2, %s0 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <512 x i32> undef, i32 %s, i32 372 + ret <512 x i32> %ret +} + +define fastcc <512 x i32> @insert_rr_v512i32(i32 %idx, i32 %s) { +; CHECK-LABEL: insert_rr_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: nnd %s2, %s0, (63)0 +; CHECK-NEXT: sla.w.sx %s2, %s2, 5 +; CHECK-NEXT: sll %s1, %s1, %s2 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvs %s3, %v0(%s0) +; CHECK-NEXT: srl %s2, (32)1, %s2 +; CHECK-NEXT: and %s2, %s3, %s2 +; CHECK-NEXT: or %s1, %s2, %s1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, 
%s10) + %ret = insertelement <512 x i32> undef, i32 %s, i32 %idx + ret <512 x i32> %ret +} + +;;; <256 x double> + +define fastcc <256 x double> @insert_rr_v256f64(i32 %idx, double %s) { +; CHECK-LABEL: insert_rr_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x double> undef, double %s, i32 %idx + ret <256 x double> %ret +} + +define fastcc <256 x double> @insert_ri7_v256f64(double %s) { +; CHECK-LABEL: insert_ri7_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v0(127), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x double> undef, double %s, i32 127 + ret <256 x double> %ret +} + +define fastcc <256 x double> @insert_ri8_v256f64(double %s) { +; CHECK-LABEL: insert_ri8_v256f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 128 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x double> undef, double %s, i32 128 + ret <256 x double> %ret +} + +define fastcc <512 x double> @insert_ri_v512f64(double %s) { +; CHECK-LABEL: insert_ri_v512f64: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v1(116), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <512 x double> undef, double %s, i32 372 + ret <512 x double> %ret +} + +;;; <256 x float> + +define fastcc <256 x float> @insert_rr_v256f32(i32 %idx, float %s) { +; CHECK-LABEL: insert_rr_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: lsv %v0(%s0), %s1 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x float> undef, float %s, i32 %idx + ret <256 x float> %ret +} + +define fastcc <256 x float> @insert_ri7_v256f32(float %s) { +; CHECK-LABEL: insert_ri7_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lsv %v0(127), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x float> undef, float %s, i32 127 + ret <256 x float> %ret +} + +define fastcc <256 x float> @insert_ri8_v256f32(float %s) { +; CHECK-LABEL: 
insert_ri8_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 128 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <256 x float> undef, float %s, i32 128 + ret <256 x float> %ret +} + +define fastcc <512 x float> @insert_ri_v512f32(float %s) { +; CHECK-LABEL: insert_ri_v512f32: +; CHECK: # %bb.0: +; CHECK-NEXT: sra.l %s0, %s0, 32 +; CHECK-NEXT: lea %s1, 186 +; CHECK-NEXT: lvs %s2, %v0(%s1) +; CHECK-NEXT: and %s2, %s2, (32)0 +; CHECK-NEXT: sll %s0, %s0, 32 +; CHECK-NEXT: or %s0, %s2, %s0 +; CHECK-NEXT: lsv %v0(%s1), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <512 x float> undef, float %s, i32 372 + ret <512 x float> %ret +} + +define fastcc <512 x float> @insert_rr_v512f32(i32 %idx, float %s) { +; CHECK-LABEL: insert_rr_v512f32: +; CHECK: # %bb.0: +; CHECK-NEXT: sra.l %s1, %s1, 32 +; CHECK-NEXT: adds.w.sx %s2, %s0, (0)1 +; CHECK-NEXT: srl %s2, %s2, 1 +; CHECK-NEXT: lvs %s3, %v0(%s2) +; CHECK-NEXT: nnd %s0, %s0, (63)0 +; CHECK-NEXT: sla.w.sx %s0, %s0, 5 +; CHECK-NEXT: srl %s4, (32)1, %s0 +; CHECK-NEXT: and %s3, %s3, %s4 +; CHECK-NEXT: adds.w.zx %s1, %s1, (0)1 +; CHECK-NEXT: sll %s0, %s1, %s0 +; CHECK-NEXT: or %s0, %s3, %s0 +; CHECK-NEXT: lsv %v0(%s2), %s0 +; CHECK-NEXT: b.l.t (, %s10) + %ret = insertelement <512 x float> undef, float %s, i32 %idx + ret <512 x float> %ret +}