diff --git a/llvm/lib/Target/VE/VECallingConv.td b/llvm/lib/Target/VE/VECallingConv.td
--- a/llvm/lib/Target/VE/VECallingConv.td
+++ b/llvm/lib/Target/VE/VECallingConv.td
@@ -103,14 +103,7 @@
 // handled conforming to the standard cc.
 def CC_VE_Fast : CallingConv<[
   // vector --> generic vector registers
-  CCIfType<[v2i32, v2i64, v2f32, v2f64,
-            v4i32, v4i64, v4f32, v4f64,
-            v8i32, v8i64, v8f32, v8f64,
-            v16i32, v16i64, v16f32, v16f64,
-            v32i32, v32i64, v32f32, v32f64,
-            v64i32, v64i64, v64f32, v64f64,
-            v128i32, v128i64, v128f32, v128f64,
-            v256i32, v256f32, v256i64, v256f64],
+  CCIfType<[v256i32, v256f32, v256i64, v256f64],
            CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
   // TODO: make this conditional on packed mode
   CCIfType<[v512i32, v512f32],
@@ -131,14 +124,7 @@
 
 def RetCC_VE_Fast : CallingConv<[
   // vector --> generic vector registers
-  CCIfType<[v2i32, v2i64, v2f32, v2f64,
-            v4i32, v4i64, v4f32, v4f64,
-            v8i32, v8i64, v8f32, v8f64,
-            v16i32, v16i64, v16f32, v16f64,
-            v32i32, v32i64, v32f32, v32f64,
-            v64i32, v64i64, v64f32, v64f64,
-            v128i32, v128i64, v128f32, v128f64,
-            v256i32, v256f32, v256i64, v256f64],
+  CCIfType<[v256i32, v256f32, v256i64, v256f64],
            CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
   // TODO: make this conditional on packed mode
   CCIfType<[v512i32, v512f32],
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -34,6 +34,8 @@
   MEMBARRIER, // Compiler barrier only; generate a no-op.
 
+  VEC_BROADCAST, // 0: scalar value, 1: VL
+
   CALL,            // A call instruction.
   RET_FLAG,        // Return with a flag operand.
   GLOBAL_BASE_REG, // Global base reg for PIC.
@@ -114,6 +116,8 @@
   SDValue lowerToTLSGeneralDynamicModel(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   /// } Custom Lower
 
   /// Custom DAGCombine {
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -70,6 +70,11 @@
   return CCInfo.CheckReturn(Outs, RetCC);
 }
 
+static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
+                                   MVT::v256f32, MVT::v512f32, MVT::v256f64};
+
+static const MVT AllMaskVTs[] = {MVT::v256i1, MVT::v512i1};
+
 void VETargetLowering::initRegisterClasses() {
   // Set up the register classes.
   addRegisterClass(MVT::i32, &VE::I32RegClass);
@@ -79,46 +84,10 @@
   addRegisterClass(MVT::f128, &VE::F128RegClass);
 
   if (Subtarget->enableVPU()) {
-    addRegisterClass(MVT::v2i32, &VE::V64RegClass);
-    addRegisterClass(MVT::v4i32, &VE::V64RegClass);
-    addRegisterClass(MVT::v8i32, &VE::V64RegClass);
-    addRegisterClass(MVT::v16i32, &VE::V64RegClass);
-    addRegisterClass(MVT::v32i32, &VE::V64RegClass);
-    addRegisterClass(MVT::v64i32, &VE::V64RegClass);
-    addRegisterClass(MVT::v128i32, &VE::V64RegClass);
-    addRegisterClass(MVT::v256i32, &VE::V64RegClass);
-    addRegisterClass(MVT::v512i32, &VE::V64RegClass);
-
-    addRegisterClass(MVT::v2i64, &VE::V64RegClass);
-    addRegisterClass(MVT::v4i64, &VE::V64RegClass);
-    addRegisterClass(MVT::v8i64, &VE::V64RegClass);
-    addRegisterClass(MVT::v16i64, &VE::V64RegClass);
-    addRegisterClass(MVT::v32i64, &VE::V64RegClass);
-    addRegisterClass(MVT::v64i64, &VE::V64RegClass);
-    addRegisterClass(MVT::v128i64, &VE::V64RegClass);
-    addRegisterClass(MVT::v256i64, &VE::V64RegClass);
-
-    addRegisterClass(MVT::v2f32, &VE::V64RegClass);
-    addRegisterClass(MVT::v4f32, &VE::V64RegClass);
-    addRegisterClass(MVT::v8f32, &VE::V64RegClass);
-    addRegisterClass(MVT::v16f32, &VE::V64RegClass);
-    addRegisterClass(MVT::v32f32, &VE::V64RegClass);
-    addRegisterClass(MVT::v64f32, &VE::V64RegClass);
-    addRegisterClass(MVT::v128f32, &VE::V64RegClass);
-    addRegisterClass(MVT::v256f32, &VE::V64RegClass);
-    addRegisterClass(MVT::v512f32, &VE::V64RegClass);
-
-    addRegisterClass(MVT::v2f64, &VE::V64RegClass);
-    addRegisterClass(MVT::v4f64, &VE::V64RegClass);
-    addRegisterClass(MVT::v8f64, &VE::V64RegClass);
-    addRegisterClass(MVT::v16f64, &VE::V64RegClass);
-    addRegisterClass(MVT::v32f64, &VE::V64RegClass);
-    addRegisterClass(MVT::v64f64, &VE::V64RegClass);
-    addRegisterClass(MVT::v128f64, &VE::V64RegClass);
-    addRegisterClass(MVT::v256f64, &VE::V64RegClass);
-
-    addRegisterClass(MVT::v256i1, &VE::VMRegClass);
-    addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
+    for (MVT VecVT : AllVectorVTs)
+      addRegisterClass(VecVT, &VE::V64RegClass);
+    for (MVT MaskVT : AllMaskVTs)
+      addRegisterClass(MaskVT, &VE::VMRegClass);
   }
 }
@@ -285,7 +254,8 @@
 }
 
 void VETargetLowering::initVPUActions() {
-  // TODO upstream vector isel
+  for (MVT LegalVecVT : AllVectorVTs)
+    setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
 }
 
 SDValue
@@ -898,6 +868,7 @@
     TARGET_NODE_CASE(GETTLSADDR)
     TARGET_NODE_CASE(MEMBARRIER)
     TARGET_NODE_CASE(CALL)
+    TARGET_NODE_CASE(VEC_BROADCAST)
     TARGET_NODE_CASE(RET_FLAG)
     TARGET_NODE_CASE(GLOBAL_BASE_REG)
   }
@@ -1403,6 +1374,32 @@
   return DAG.getMergeValues(Ops, DL);
 }
 
+static SDValue getSplatValue(SDNode *N) {
+  if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
+    return BuildVec->getSplatValue();
+  }
+  return SDValue();
+}
+
+SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  unsigned NumEls = Op.getValueType().getVectorNumElements();
+  MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
+
+  if (SDValue ScalarV = getSplatValue(Op.getNode())) {
+    // lower to VEC_BROADCAST
+    MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
+
+    auto AVL = DAG.getConstant(NumEls, DL, MVT::i32);
+    return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, Op.getOperand(0),
+                       AVL);
+  }
+
+  // Expand
+  return SDValue();
+}
+
 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default:
@@ -1423,6 +1420,8 @@
     return lowerJumpTable(Op, DAG);
   case ISD::LOAD:
     return lowerLOAD(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return lowerBUILD_VECTOR(Op, DAG);
   case ISD::STORE:
     return lowerSTORE(Op, DAG);
   case ISD::VASTART:
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -2224,3 +2224,6 @@
 
 // The vevlintrin
 include "VEInstrIntrinsicVL.td"
+
+// Patterns and intermediate SD nodes (VEC_*).
+include "VEInstrPatternsVec.td"
diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -0,0 +1,48 @@
+//===-- VEInstrPatternsVec.td - VEC_-type SDNodes and isel for VE Target --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the VEC_* prefixed intermediate SDNodes and their
+// isel patterns.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+// Custom intermediate ISDs.
+class IsVLVT<int OpIdx> : SDTCisVT<OpIdx, i32>;
+def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST",
+                           SDTypeProfile<1, 2, [SDTCisVec<0>, IsVLVT<2>]>>;
+
+multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
+                       SDNodeXForm ImmCast, SubRegIndex SubRegIdx> {
+  // VBRDil
+  def : Pat<(v32 (vec_broadcast (s32 ImmOp:$sy), i32:$vl)),
+            (VBRDil (ImmCast $sy), i32:$vl)>;
+
+  // VBRDrl
+  def : Pat<(v32 (vec_broadcast s32:$sy, i32:$vl)),
+            (VBRDrl
+                (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $sy, SubRegIdx),
+                i32:$vl)>;
+}
+
+defm : vbrd_elem32<v256i32, i32, simm7, LO7, sub_i32>;
+defm : vbrd_elem32<v256f32, f32, simm7fp, LO7FP, sub_f32>;
+
+multiclass vbrd_elem64<ValueType v64, ValueType s64,
+                       SDPatternOperator ImmOp, SDNodeXForm ImmCast> {
+  // VBRDil
+  def : Pat<(v64 (vec_broadcast (s64 ImmOp:$sy), i32:$vl)),
+            (VBRDil (ImmCast $sy), i32:$vl)>;
+
+  // VBRDrl
+  def : Pat<(v64 (vec_broadcast s64:$sy, i32:$vl)),
+            (VBRDrl s64:$sy, i32:$vl)>;
+}
+
+defm : vbrd_elem64<v256i64, i64, simm7, LO7>;
+defm : vbrd_elem64<v256f64, f64, simm7fp, LO7FP>;
diff --git a/llvm/lib/Target/VE/VERegisterInfo.td b/llvm/lib/Target/VE/VERegisterInfo.td
--- a/llvm/lib/Target/VE/VERegisterInfo.td
+++ b/llvm/lib/Target/VE/VERegisterInfo.td
@@ -185,14 +185,7 @@
 def V64 : RegisterClass<"VE",
                         [v256f64, // default type for vector registers
                          v512i32, v512f32,
-                         v256i64, v256i32, v256f32, /* v256f64, */
-                         v128i64, v128i32, v128f32, v128f64,
-                         v64i64, v64i32, v64f32, v64f64,
-                         v32i64, v32i32, v32f32, v32f64,
-                         v16i64, v16i32, v16f32, v16f64,
-                         v8i64, v8i32, v8f32, v8f64,
-                         v4i64, v4i32, v4f32, v4f64,
-                         v2i64, v2i32, v2f32, v2f64], 64,
+                         v256i64, v256i32, v256f32, /* v256f64, */], 64,
                         (add (sequence "V%u", 0, 63), VIX)>;
diff --git a/llvm/test/CodeGen/VE/Vector/vec_broadcast.ll b/llvm/test/CodeGen/VE/Vector/vec_broadcast.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_broadcast.ll
@@ -0,0 +1,328 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+; ISA-compatible vector broadcasts
+define fastcc <256 x i64> @brd_v256i64(i64 %s) {
+; CHECK-LABEL: brd_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x i64> undef, i64 %s, i32 0
+  %ret = shufflevector <256 x i64> %val, <256 x i64> undef, <256 x i32> zeroinitializer
+  ret <256 x i64> %ret
+}
+
+define fastcc <256 x i64> @brdi_v256i64() {
+; CHECK-LABEL: brdi_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vbrd %v0, 1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x i64> undef, i64 1, i32 0
+  %ret = shufflevector <256 x i64> %val, <256 x i64> undef, <256 x i32> zeroinitializer
+  ret <256 x i64> %ret
+}
+
+define fastcc <256 x double> @brd_v256f64(double %s) {
+; CHECK-LABEL: brd_v256f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x double> undef, double %s, i32 0
+  %ret = shufflevector <256 x double> %val, <256 x double> undef, <256 x i32> zeroinitializer
+  ret <256 x double> %ret
+}
+
+define fastcc <256 x double> @brdi_v256f64() {
+; CHECK-LABEL: brdi_v256f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vbrd %v0, 0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x double> undef, double 0.e+00, i32 0
+  %ret = shufflevector <256 x double> %val, <256 x double> undef, <256 x i32> zeroinitializer
+  ret <256 x double> %ret
+}
+
+define fastcc <256 x i32> @brd_v256i32(i32 %s) {
+; CHECK-LABEL: brd_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x i32> undef, i32 %s, i32 0
+  %ret = shufflevector <256 x i32> %val, <256 x i32> undef, <256 x i32> zeroinitializer
+  ret <256 x i32> %ret
+}
+
+define fastcc <256 x i32> @brdi_v256i32(i32 %s) {
+; CHECK-LABEL: brdi_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vbrd %v0, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x i32> undef, i32 13, i32 0
+  %ret = shufflevector <256 x i32> %val, <256 x i32> undef, <256 x i32> zeroinitializer
+  ret <256 x i32> %ret
+}
+
+define fastcc <256 x float> @brd_v256f32(float %s) {
+; CHECK-LABEL: brd_v256f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x float> undef, float %s, i32 0
+  %ret = shufflevector <256 x float> %val, <256 x float> undef, <256 x i32> zeroinitializer
+  ret <256 x float> %ret
+}
+
+define fastcc <256 x float> @brdi_v256f32(float %s) {
+; CHECK-LABEL: brdi_v256f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vbrd %v0, 0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x float> undef, float 0.e+00, i32 0
+  %ret = shufflevector <256 x float> %val, <256 x float> undef, <256 x i32> zeroinitializer
+  ret <256 x float> %ret
+}
+
+
+; Shorter vectors: we expect these to be widened (for now).
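+; The <128 x ...> cases below are widened by the type legalizer to the legal
+; 256-element types (padding with undef elements) before lowerBUILD_VECTOR
+; runs, so the checks still expect the broadcast to use an AVL of 256.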
+define fastcc <128 x i64> @brd_v128i64(i64 %s) {
+; CHECK-LABEL: brd_v128i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <128 x i64> undef, i64 %s, i32 0
+  %ret = shufflevector <128 x i64> %val, <128 x i64> undef, <128 x i32> zeroinitializer
+  ret <128 x i64> %ret
+}
+
+define fastcc <128 x double> @brd_v128f64(double %s) {
+; CHECK-LABEL: brd_v128f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <128 x double> undef, double %s, i32 0
+  %ret = shufflevector <128 x double> %val, <128 x double> undef, <128 x i32> zeroinitializer
+  ret <128 x double> %ret
+}
+
+define fastcc <128 x i32> @brd_v128i32(i32 %s) {
+; CHECK-LABEL: brd_v128i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <128 x i32> undef, i32 %s, i32 0
+  %ret = shufflevector <128 x i32> %val, <128 x i32> undef, <128 x i32> zeroinitializer
+  ret <128 x i32> %ret
+}
+
+define fastcc <128 x i32> @brdi_v128i32(i32 %s) {
+; CHECK-LABEL: brdi_v128i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vbrd %v0, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <128 x i32> undef, i32 13, i32 0
+  %ret = shufflevector <128 x i32> %val, <128 x i32> undef, <128 x i32> zeroinitializer
+  ret <128 x i32> %ret
+}
+
+define fastcc <128 x float> @brd_v128f32(float %s) {
+; CHECK-LABEL: brd_v128f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <128 x float> undef, float %s, i32 0
+  %ret = shufflevector <128 x float> %val, <128 x float> undef, <128 x i32> zeroinitializer
+  ret <128 x float> %ret
+}
+
+define fastcc <128 x float> @brdi_v128f32(float %s) {
+; CHECK-LABEL: brdi_v128f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vbrd %v0, 0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <128 x float> undef, float 0.e+00, i32 0
+  %ret = shufflevector <128 x float> %val, <128 x float> undef, <128 x i32> zeroinitializer
+  ret <128 x float> %ret
+}
+
+; Vectors with small element types and a valid element count: we expect these
+; to be promoted.
+define fastcc <256 x i16> @brd_v256i16(i16 %s) {
+; CHECK-LABEL: brd_v256i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v0, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <256 x i16> undef, i16 %s, i32 0
+  %ret = shufflevector <256 x i16> %val, <256 x i16> undef, <256 x i32> zeroinitializer
+  ret <256 x i16> %ret
+}
+
+; Vectors with small element types and a low element count: these are
+; scalarized for now.
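+; In the scalarized output below the <128 x i16> result is returned in memory
+; through a pointer argument (here %s0), and every lane is written with an
+; individual 2-byte store (st2b).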
+; FIXME Promote + Widen
+define fastcc <128 x i16> @brd_v128i16(i16 %s) {
+; CHECK-LABEL: brd_v128i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT:    st2b %s1, 254(, %s0)
+; CHECK-NEXT:    st2b %s1, 252(, %s0)
+; CHECK-NEXT:    st2b %s1, 250(, %s0)
+; CHECK-NEXT:    st2b %s1, 248(, %s0)
+; CHECK-NEXT:    st2b %s1, 246(, %s0)
+; CHECK-NEXT:    st2b %s1, 244(, %s0)
+; CHECK-NEXT:    st2b %s1, 242(, %s0)
+; CHECK-NEXT:    st2b %s1, 240(, %s0)
+; CHECK-NEXT:    st2b %s1, 238(, %s0)
+; CHECK-NEXT:    st2b %s1, 236(, %s0)
+; CHECK-NEXT:    st2b %s1, 234(, %s0)
+; CHECK-NEXT:    st2b %s1, 232(, %s0)
+; CHECK-NEXT:    st2b %s1, 230(, %s0)
+; CHECK-NEXT:    st2b %s1, 228(, %s0)
+; CHECK-NEXT:    st2b %s1, 226(, %s0)
+; CHECK-NEXT:    st2b %s1, 224(, %s0)
+; CHECK-NEXT:    st2b %s1, 222(, %s0)
+; CHECK-NEXT:    st2b %s1, 220(, %s0)
+; CHECK-NEXT:    st2b %s1, 218(, %s0)
+; CHECK-NEXT:    st2b %s1, 216(, %s0)
+; CHECK-NEXT:    st2b %s1, 214(, %s0)
+; CHECK-NEXT:    st2b %s1, 212(, %s0)
+; CHECK-NEXT:    st2b %s1, 210(, %s0)
+; CHECK-NEXT:    st2b %s1, 208(, %s0)
+; CHECK-NEXT:    st2b %s1, 206(, %s0)
+; CHECK-NEXT:    st2b %s1, 204(, %s0)
+; CHECK-NEXT:    st2b %s1, 202(, %s0)
+; CHECK-NEXT:    st2b %s1, 200(, %s0)
+; CHECK-NEXT:    st2b %s1, 198(, %s0)
+; CHECK-NEXT:    st2b %s1, 196(, %s0)
+; CHECK-NEXT:    st2b %s1, 194(, %s0)
+; CHECK-NEXT:    st2b %s1, 192(, %s0)
+; CHECK-NEXT:    st2b %s1, 190(, %s0)
+; CHECK-NEXT:    st2b %s1, 188(, %s0)
+; CHECK-NEXT:    st2b %s1, 186(, %s0)
+; CHECK-NEXT:    st2b %s1, 184(, %s0)
+; CHECK-NEXT:    st2b %s1, 182(, %s0)
+; CHECK-NEXT:    st2b %s1, 180(, %s0)
+; CHECK-NEXT:    st2b %s1, 178(, %s0)
+; CHECK-NEXT:    st2b %s1, 176(, %s0)
+; CHECK-NEXT:    st2b %s1, 174(, %s0)
+; CHECK-NEXT:    st2b %s1, 172(, %s0)
+; CHECK-NEXT:    st2b %s1, 170(, %s0)
+; CHECK-NEXT:    st2b %s1, 168(, %s0)
+; CHECK-NEXT:    st2b %s1, 166(, %s0)
+; CHECK-NEXT:    st2b %s1, 164(, %s0)
+; CHECK-NEXT:    st2b %s1, 162(, %s0)
+; CHECK-NEXT:    st2b %s1, 160(, %s0)
+; CHECK-NEXT:    st2b %s1, 158(, %s0)
+; CHECK-NEXT:    st2b %s1, 156(, %s0)
+; CHECK-NEXT:    st2b %s1, 154(, %s0)
+; CHECK-NEXT:    st2b %s1, 152(, %s0)
+; CHECK-NEXT:    st2b %s1, 150(, %s0)
+; CHECK-NEXT:    st2b %s1, 148(, %s0)
+; CHECK-NEXT:    st2b %s1, 146(, %s0)
+; CHECK-NEXT:    st2b %s1, 144(, %s0)
+; CHECK-NEXT:    st2b %s1, 142(, %s0)
+; CHECK-NEXT:    st2b %s1, 140(, %s0)
+; CHECK-NEXT:    st2b %s1, 138(, %s0)
+; CHECK-NEXT:    st2b %s1, 136(, %s0)
+; CHECK-NEXT:    st2b %s1, 134(, %s0)
+; CHECK-NEXT:    st2b %s1, 132(, %s0)
+; CHECK-NEXT:    st2b %s1, 130(, %s0)
+; CHECK-NEXT:    st2b %s1, 128(, %s0)
+; CHECK-NEXT:    st2b %s1, 126(, %s0)
+; CHECK-NEXT:    st2b %s1, 124(, %s0)
+; CHECK-NEXT:    st2b %s1, 122(, %s0)
+; CHECK-NEXT:    st2b %s1, 120(, %s0)
+; CHECK-NEXT:    st2b %s1, 118(, %s0)
+; CHECK-NEXT:    st2b %s1, 116(, %s0)
+; CHECK-NEXT:    st2b %s1, 114(, %s0)
+; CHECK-NEXT:    st2b %s1, 112(, %s0)
+; CHECK-NEXT:    st2b %s1, 110(, %s0)
+; CHECK-NEXT:    st2b %s1, 108(, %s0)
+; CHECK-NEXT:    st2b %s1, 106(, %s0)
+; CHECK-NEXT:    st2b %s1, 104(, %s0)
+; CHECK-NEXT:    st2b %s1, 102(, %s0)
+; CHECK-NEXT:    st2b %s1, 100(, %s0)
+; CHECK-NEXT:    st2b %s1, 98(, %s0)
+; CHECK-NEXT:    st2b %s1, 96(, %s0)
+; CHECK-NEXT:    st2b %s1, 94(, %s0)
+; CHECK-NEXT:    st2b %s1, 92(, %s0)
+; CHECK-NEXT:    st2b %s1, 90(, %s0)
+; CHECK-NEXT:    st2b %s1, 88(, %s0)
+; CHECK-NEXT:    st2b %s1, 86(, %s0)
+; CHECK-NEXT:    st2b %s1, 84(, %s0)
+; CHECK-NEXT:    st2b %s1, 82(, %s0)
+; CHECK-NEXT:    st2b %s1, 80(, %s0)
+; CHECK-NEXT:    st2b %s1, 78(, %s0)
+; CHECK-NEXT:    st2b %s1, 76(, %s0)
+; CHECK-NEXT:    st2b %s1, 74(, %s0)
+; CHECK-NEXT:    st2b %s1, 72(, %s0)
+; CHECK-NEXT:    st2b %s1, 70(, %s0)
+; CHECK-NEXT:    st2b %s1, 68(, %s0)
+; CHECK-NEXT:    st2b %s1, 66(, %s0)
+; CHECK-NEXT:    st2b %s1, 64(, %s0)
+; CHECK-NEXT:    st2b %s1, 62(, %s0)
+; CHECK-NEXT:    st2b %s1, 60(, %s0)
+; CHECK-NEXT:    st2b %s1, 58(, %s0)
+; CHECK-NEXT:    st2b %s1, 56(, %s0)
+; CHECK-NEXT:    st2b %s1, 54(, %s0)
+; CHECK-NEXT:    st2b %s1, 52(, %s0)
+; CHECK-NEXT:    st2b %s1, 50(, %s0)
+; CHECK-NEXT:    st2b %s1, 48(, %s0)
+; CHECK-NEXT:    st2b %s1, 46(, %s0)
+; CHECK-NEXT:    st2b %s1, 44(, %s0)
+; CHECK-NEXT:    st2b %s1, 42(, %s0)
+; CHECK-NEXT:    st2b %s1, 40(, %s0)
+; CHECK-NEXT:    st2b %s1, 38(, %s0)
+; CHECK-NEXT:    st2b %s1, 36(, %s0)
+; CHECK-NEXT:    st2b %s1, 34(, %s0)
+; CHECK-NEXT:    st2b %s1, 32(, %s0)
+; CHECK-NEXT:    st2b %s1, 30(, %s0)
+; CHECK-NEXT:    st2b %s1, 28(, %s0)
+; CHECK-NEXT:    st2b %s1, 26(, %s0)
+; CHECK-NEXT:    st2b %s1, 24(, %s0)
+; CHECK-NEXT:    st2b %s1, 22(, %s0)
+; CHECK-NEXT:    st2b %s1, 20(, %s0)
+; CHECK-NEXT:    st2b %s1, 18(, %s0)
+; CHECK-NEXT:    st2b %s1, 16(, %s0)
+; CHECK-NEXT:    st2b %s1, 14(, %s0)
+; CHECK-NEXT:    st2b %s1, 12(, %s0)
+; CHECK-NEXT:    st2b %s1, 10(, %s0)
+; CHECK-NEXT:    st2b %s1, 8(, %s0)
+; CHECK-NEXT:    st2b %s1, 6(, %s0)
+; CHECK-NEXT:    st2b %s1, 4(, %s0)
+; CHECK-NEXT:    st2b %s1, 2(, %s0)
+; CHECK-NEXT:    st2b %s1, (, %s0)
+; CHECK-NEXT:    b.l.t (, %s10)
+  %val = insertelement <128 x i16> undef, i16 %s, i32 0
+  %ret = shufflevector <128 x i16> %val, <128 x i16> undef, <128 x i32> zeroinitializer
+  ret <128 x i16> %ret
+}
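
A note on the splat test driving this lowering: BuildVectorSDNode::getSplatValue() only requires the defined operands of the BUILD_VECTOR to agree and skips undef operands, so the undef padding introduced when the shorter vectors above are widened to 256 elements does not defeat the broadcast path. A minimal standalone sketch of that behavior (the helper name is hypothetical; it mirrors the getSplatValue() helper added in VEISelLowering.cpp):

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    // Sketch only: splat detection as used by lowerBUILD_VECTOR.
    // getSplatValue() ignores undef operands, so the widened form of a
    // <128 x i64> splat, (v256i64 BUILD_VECTOR x, ..., x, undef, ..., undef),
    // still reports x as its splat value and reaches VEC_BROADCAST.
    static SDValue getSplatScalar(SDNode *N) {
      if (auto *BV = dyn_cast<BuildVectorSDNode>(N))
        return BV->getSplatValue(); // Null SDValue if the elements disagree.
      return SDValue();
    }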