diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -114,6 +114,9 @@ const SelectionDAG &DAG, unsigned Depth) const override; + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(MVT VT) const override; + SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -303,9 +303,6 @@ setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal); } - // And some truncating stores are legal as well - setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); - setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); } // Don't do anything clever with build_pairs @@ -854,6 +851,21 @@ } } +TargetLoweringBase::LegalizeTypeAction +WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const { + if (VT.isFixedLengthVector()) { + MVT EltVT = VT.getVectorElementType(); + // We have legal vector types with these lane types, so widening the + // vector would let us use some of the lanes directly without having to + // extend or truncate values. + if (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 || + EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64) + return TypeWidenVector; + } + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + //===----------------------------------------------------------------------===// // WebAssembly Lowering private implementation. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -267,6 +267,16 @@ defm "" : SIMDLoadZero; defm "" : SIMDLoadZero; +// Use load_zero to load scalars into vectors as well where possible. +// TODO: i32, i16, and i8 scalars +def load_scalar : + PatFrag<(ops node:$addr), (scalar_to_vector (i64 (load $addr)))>; +defm : LoadPatNoOffset; +defm : LoadPatImmOff; +defm : LoadPatImmOff; +defm : LoadPatOffsetOnly; +defm : LoadPatGlobalAddrOffOnly; + // TODO: f32x4 and f64x2 as well foreach vec = [I32x4, I64x2] in { defvar inst = "LOAD_ZERO_"#vec; @@ -1241,87 +1251,6 @@ defm "" : SIMDNarrow; defm "" : SIMDNarrow; -// Use narrowing operations for truncating stores. Since the narrowing -// operations are saturating instead of truncating, we need to mask -// the stored values first. 
-def store_v8i8_trunc_v8i16 : - OutPatFrag<(ops node:$val), - (EXTRACT_LANE_I64x2 - (NARROW_U_I8x16 - (AND - (CONST_V128_I16x8 - 0x00ff, 0x00ff, 0x00ff, 0x00ff, - 0x00ff, 0x00ff, 0x00ff, 0x00ff), - node:$val), - $val), // Unused input - 0)>; - -def store_v4i16_trunc_v4i32 : - OutPatFrag<(ops node:$val), - (EXTRACT_LANE_I64x2 - (NARROW_U_I16x8 - (AND - (CONST_V128_I32x4 - 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff), - node:$val), - $val), // Unused input - 0)>; - -// Store patterns adapted from WebAssemblyInstrMemory.td -multiclass NarrowingStorePatNoOffset { - defvar node = !cast("truncstorevi"#vec.split.lane_bits); - def : Pat<(node vec.vt:$val, I32:$addr), - (STORE_I64_A32 0, 0, $addr, (out $val))>, - Requires<[HasAddr32]>; - def : Pat<(node vec.vt:$val, I64:$addr), - (STORE_I64_A64 0, 0, $addr, (out $val))>, - Requires<[HasAddr64]>; -} - -defm : NarrowingStorePatNoOffset; -defm : NarrowingStorePatNoOffset; - -multiclass NarrowingStorePatImmOff { - defvar node = !cast("truncstorevi"#vec.split.lane_bits); - def : Pat<(node vec.vt:$val, (operand I32:$addr, imm:$off)), - (STORE_I64_A32 0, imm:$off, $addr, (out $val))>, - Requires<[HasAddr32]>; - def : Pat<(node vec.vt:$val, (operand I64:$addr, imm:$off)), - (STORE_I64_A64 0, imm:$off, $addr, (out $val))>, - Requires<[HasAddr64]>; -} - -defm : NarrowingStorePatImmOff; -defm : NarrowingStorePatImmOff; -defm : NarrowingStorePatImmOff; -defm : NarrowingStorePatImmOff; - -multiclass NarrowingStorePatOffsetOnly { - defvar node = !cast("truncstorevi"#vec.split.lane_bits); - def : Pat<(node vec.vt:$val, imm:$off), - (STORE_I64_A32 0, imm:$off, (CONST_I32 0), (out $val))>, - Requires<[HasAddr32]>; - def : Pat<(node vec.vt:$val, imm:$off), - (STORE_I64_A64 0, imm:$off, (CONST_I64 0), (out $val))>, - Requires<[HasAddr64]>; -} - -defm : NarrowingStorePatOffsetOnly; -defm : NarrowingStorePatOffsetOnly; - -multiclass NarrowingStorePatGlobalAddrOffOnly { - defvar node = !cast("truncstorevi"#vec.split.lane_bits); - def : Pat<(node vec.vt:$val, (WebAssemblywrapper tglobaladdr:$off)), - (STORE_I64_A32 0, tglobaladdr:$off, (CONST_I32 0), (out $val))>, - Requires<[IsNotPIC, HasAddr32]>; - def : Pat<(node vec.vt:$val, (WebAssemblywrapper tglobaladdr:$off)), - (STORE_I64_A64 0, tglobaladdr:$off, (CONST_I64 0), (out $val))>, - Requires<[IsNotPIC, HasAddr64]>; -} - -defm : NarrowingStorePatGlobalAddrOffOnly; -defm : NarrowingStorePatGlobalAddrOffOnly; - // Bitcasts are nops // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types foreach t1 = AllVecs in diff --git a/llvm/test/CodeGen/WebAssembly/simd-concat.ll b/llvm/test/CodeGen/WebAssembly/simd-concat.ll --- a/llvm/test/CodeGen/WebAssembly/simd-concat.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-concat.ll @@ -11,7 +11,7 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> ret <16 x i8> %v @@ -23,7 +23,7 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> ret <8 x i8> %v @@ -35,7 +35,7 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: 
local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> ret <8 x i16> %v @@ -47,7 +47,7 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK-NEXT: i8x16.shuffle 0, 1, 16, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> ret <4 x i8> %v @@ -59,7 +59,7 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> ret <4 x i16> %v @@ -71,7 +71,7 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> ret <4 x i32> %v diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending.ll b/llvm/test/CodeGen/WebAssembly/simd-extending.ll --- a/llvm/test/CodeGen/WebAssembly/simd-extending.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending.ll @@ -169,29 +169,8 @@ ; CHECK: .functype extend_lowish_i8x16_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 1 -; CHECK-NEXT: i16x8.splat ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 2 -; CHECK-NEXT: i16x8.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 3 -; CHECK-NEXT: i16x8.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 4 -; CHECK-NEXT: i16x8.replace_lane 3 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 5 -; CHECK-NEXT: i16x8.replace_lane 4 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 6 -; CHECK-NEXT: i16x8.replace_lane 5 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 7 -; CHECK-NEXT: i16x8.replace_lane 6 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 8 -; CHECK-NEXT: i16x8.replace_lane 7 +; CHECK-NEXT: i8x16.shuffle 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0 ; CHECK-NEXT: i32.const 8 ; CHECK-NEXT: i16x8.shl ; CHECK-NEXT: i32.const 8 @@ -208,17 +187,8 @@ ; CHECK: .functype extend_lowish_i16x8_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 1 -; CHECK-NEXT: i32x4.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 2 -; CHECK-NEXT: i32x4.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 3 -; CHECK-NEXT: i32x4.replace_lane 2 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 4 -; CHECK-NEXT: i32x4.replace_lane 3 +; CHECK-NEXT: i8x16.shuffle 2, 3, 0, 0, 4, 5, 0, 0, 6, 7, 0, 0, 8, 9, 0, 0 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32x4.shl ; CHECK-NEXT: i32.const 16 diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll b/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll --- a/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll +++ 
b/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll @@ -294,7 +294,7 @@ ; CHECK: .functype load_ext_v8i16_a1 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.load8x8_u 0:p2align=0 +; CHECK-NEXT: v128.load64_zero 0:p2align=0 ; CHECK-NEXT: # fallthrough-return %v = load <8 x i8>, <8 x i8>* %p, align 1 ret <8 x i8> %v @@ -305,7 +305,7 @@ ; CHECK: .functype load_ext_v8i16_a2 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.load8x8_u 0:p2align=1 +; CHECK-NEXT: v128.load64_zero 0:p2align=1 ; CHECK-NEXT: # fallthrough-return %v = load <8 x i8>, <8 x i8>* %p, align 2 ret <8 x i8> %v @@ -316,7 +316,7 @@ ; CHECK: .functype load_ext_v8i16_a4 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.load8x8_u 0:p2align=2 +; CHECK-NEXT: v128.load64_zero 0:p2align=2 ; CHECK-NEXT: # fallthrough-return %v = load <8 x i8>, <8 x i8>* %p, align 4 ret <8 x i8> %v @@ -328,7 +328,7 @@ ; CHECK: .functype load_ext_v8i16_a8 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.load8x8_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %v = load <8 x i8>, <8 x i8>* %p, align 8 ret <8 x i8> %v @@ -340,7 +340,7 @@ ; CHECK: .functype load_ext_v8i16_a16 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.load8x8_u 0 +; CHECK-NEXT: v128.load 0 ; CHECK-NEXT: # fallthrough-return %v = load <8 x i8>, <8 x i8>* %p, align 16 ret <8 x i8> %v @@ -636,7 +636,7 @@ ; CHECK: .functype load_ext_v4i32_a1 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.load16x4_u 0:p2align=0 +; CHECK-NEXT: v128.load64_zero 0:p2align=0 ; CHECK-NEXT: # fallthrough-return %v = load <4 x i16>, <4 x i16>* %p, align 1 ret <4 x i16> %v @@ -647,7 +647,7 @@ ; CHECK: .functype load_ext_v4i32_a2 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.load16x4_u 0:p2align=1 +; CHECK-NEXT: v128.load64_zero 0:p2align=1 ; CHECK-NEXT: # fallthrough-return %v = load <4 x i16>, <4 x i16>* %p, align 2 ret <4 x i16> %v @@ -658,7 +658,7 @@ ; CHECK: .functype load_ext_v4i32_a4 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.load16x4_u 0:p2align=2 +; CHECK-NEXT: v128.load64_zero 0:p2align=2 ; CHECK-NEXT: # fallthrough-return %v = load <4 x i16>, <4 x i16>* %p, align 4 ret <4 x i16> %v @@ -670,7 +670,7 @@ ; CHECK: .functype load_ext_v4i32_a8 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.load16x4_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %v = load <4 x i16>, <4 x i16>* %p, align 8 ret <4 x i16> %v @@ -682,7 +682,7 @@ ; CHECK: .functype load_ext_v4i32_a16 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.load16x4_u 0 +; CHECK-NEXT: v128.load 0 ; CHECK-NEXT: # fallthrough-return %v = load <4 x i16>, <4 x i16>* %p, align 16 ret <4 x i16> %v diff --git a/llvm/test/CodeGen/WebAssembly/simd-nonconst-sext.ll b/llvm/test/CodeGen/WebAssembly/simd-nonconst-sext.ll deleted file mode 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-nonconst-sext.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mattr=+simd128 | FileCheck %s - -; A regression test for a bug in the lowering of SIGN_EXTEND_INREG -; with SIMD and without sign-ext where ISel would crash if the index -; of the vector extract was not a constant. 
- -target triple = "wasm32" - -; CHECK-LABEL: foo: -; CHECK-NEXT: .functype foo () -> (f32) -; CHECK: i32x4.load16x4_u -; CHECK: f32.convert_i32_s -define float @foo() { - %1 = load <4 x i16>, <4 x i16>* undef, align 8 - %2 = load i32, i32* undef, align 4 - %vecext = extractelement <4 x i16> %1, i32 %2 - %conv = sitofp i16 %vecext to float - ret float %conv -} diff --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll --- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll @@ -402,7 +402,7 @@ ; CHECK: .functype load_ext_v8i16 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.load8x8_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %v = load <8 x i8>, <8 x i8>* %p ret <8 x i8> %v @@ -473,7 +473,7 @@ ; CHECK: .functype load_ext_v8i16_with_folded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.load8x8_u 16 +; CHECK-NEXT: v128.load64_zero 16 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <8 x i8>* %p to i32 %r = add nuw i32 %q, 16 @@ -539,7 +539,7 @@ ; CHECK: .functype load_ext_v8i16_with_folded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.load8x8_u 8 +; CHECK-NEXT: v128.load64_zero 8 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 1 %v = load <8 x i8>, <8 x i8>* %s @@ -613,7 +613,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const -8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i16x8.load8x8_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 -1 %v = load <8 x i8>, <8 x i8>* %s @@ -695,7 +695,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i16x8.load8x8_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <8 x i8>* %p to i32 %r = add nsw i32 %q, 16 @@ -771,7 +771,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i16x8.load8x8_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr <8 x i8>, <8 x i8>* %p, i32 1 %v = load <8 x i8>, <8 x i8>* %s @@ -835,7 +835,7 @@ ; CHECK: .functype load_ext_v8i16_from_numeric_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: i16x8.load8x8_u 32 +; CHECK-NEXT: v128.load64_zero 32 ; CHECK-NEXT: # fallthrough-return %s = inttoptr i32 32 to <8 x i8>* %v = load <8 x i8>, <8 x i8>* %s @@ -898,7 +898,7 @@ ; CHECK: .functype load_ext_v8i16_from_global_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: i16x8.load8x8_u gv_v8i8 +; CHECK-NEXT: v128.load64_zero gv_v8i8 ; CHECK-NEXT: # fallthrough-return %v = load <8 x i8>, <8 x i8>* @gv_v8i8 ret <8 x i8> %v @@ -922,13 +922,8 @@ ; CHECK: .functype store_narrowing_v8i16 (v128, i32) -> () ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.narrow_i16x8_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return store <8 x i8> %v, <8 x i8>* %p ret void @@ -954,13 +949,10 @@ ; CHECK: .functype store_narrowing_v8i16_with_folded_offset (v128, i32) -> () ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 1 -; 
CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.narrow_i16x8_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 16 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <8 x i8>* %p to i32 %r = add nuw i32 %q, 16 @@ -987,13 +979,10 @@ ; CHECK: .functype store_narrowing_v8i16_with_folded_gep_offset (v128, i32) -> () ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and +; CHECK-NEXT: i32.const 8 +; CHECK-NEXT: i32.add ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.narrow_i16x8_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 8 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 1 store <8 x i8> %v , <8 x i8>* %s @@ -1022,13 +1011,8 @@ ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i32.const -8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.narrow_i16x8_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 -1 store <8 x i8> %v , <8 x i8>* %s @@ -1059,13 +1043,8 @@ ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.narrow_i16x8_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <8 x i8>* %p to i32 %r = add nsw i32 %q, 16 @@ -1096,13 +1075,8 @@ ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i32.const 8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.narrow_i16x8_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr <8 x i8>, <8 x i8>* %p, i32 1 store <8 x i8> %v , <8 x i8>* %s @@ -1126,14 +1100,9 @@ ; CHECK-LABEL: store_narrowing_v8i16_to_numeric_address: ; CHECK: .functype store_narrowing_v8i16_to_numeric_address (v128, i32) -> () ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and +; CHECK-NEXT: i32.const 32 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.narrow_i16x8_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 32 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = inttoptr i32 32 to <8 x i8>* store <8 x i8> %v , <8 x i8>* %s @@ -1156,14 +1125,9 @@ ; CHECK-LABEL: store_narrowing_v8i16_to_global_address: ; CHECK: .functype store_narrowing_v8i16_to_global_address (v128) -> () ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and +; CHECK-NEXT: i32.const gv_v8i8 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.narrow_i16x8_u -; CHECK-NEXT: 
i64x2.extract_lane 0 -; CHECK-NEXT: i64.store gv_v8i8 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return store <8 x i8> %v , <8 x i8>* @gv_v8i8 ret void @@ -1225,7 +1189,7 @@ ; CHECK: .functype load_ext_v4i32 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.load16x4_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %v = load <4 x i16>, <4 x i16>* %p ret <4 x i16> %v @@ -1296,7 +1260,7 @@ ; CHECK: .functype load_ext_v4i32_with_folded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.load16x4_u 16 +; CHECK-NEXT: v128.load64_zero 16 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <4 x i16>* %p to i32 %r = add nuw i32 %q, 16 @@ -1362,7 +1326,7 @@ ; CHECK: .functype load_ext_v4i32_with_folded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.load16x4_u 8 +; CHECK-NEXT: v128.load64_zero 8 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 1 %v = load <4 x i16>, <4 x i16>* %s @@ -1436,7 +1400,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const -8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i32x4.load16x4_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 -1 %v = load <4 x i16>, <4 x i16>* %s @@ -1518,7 +1482,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i32x4.load16x4_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <4 x i16>* %p to i32 %r = add nsw i32 %q, 16 @@ -1594,7 +1558,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i32x4.load16x4_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr <4 x i16>, <4 x i16>* %p, i32 1 %v = load <4 x i16>, <4 x i16>* %s @@ -1658,7 +1622,7 @@ ; CHECK: .functype load_ext_v4i32_from_numeric_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: i32x4.load16x4_u 32 +; CHECK-NEXT: v128.load64_zero 32 ; CHECK-NEXT: # fallthrough-return %s = inttoptr i32 32 to <4 x i16>* %v = load <4 x i16>, <4 x i16>* %s @@ -1721,7 +1685,7 @@ ; CHECK: .functype load_ext_v4i32_from_global_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: i32x4.load16x4_u gv_v4i16 +; CHECK-NEXT: v128.load64_zero gv_v4i16 ; CHECK-NEXT: # fallthrough-return %v = load <4 x i16>, <4 x i16>* @gv_v4i16 ret <4 x i16> %v @@ -1744,13 +1708,8 @@ ; CHECK: .functype store_narrowing_v4i32 (v128, i32) -> () ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.narrow_i32x4_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return store <4 x i16> %v , <4 x i16>* %p ret void @@ -1776,13 +1735,10 @@ ; CHECK: .functype store_narrowing_v4i32_with_folded_offset (v128, i32) -> () ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.narrow_i32x4_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 16 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %q = 
ptrtoint <4 x i16>* %p to i32 %r = add nuw i32 %q, 16 @@ -1809,13 +1765,10 @@ ; CHECK: .functype store_narrowing_v4i32_with_folded_gep_offset (v128, i32) -> () ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and +; CHECK-NEXT: i32.const 8 +; CHECK-NEXT: i32.add ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.narrow_i32x4_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 8 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 1 store <4 x i16> %v , <4 x i16>* %s @@ -1844,13 +1797,8 @@ ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i32.const -8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.narrow_i32x4_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 -1 store <4 x i16> %v , <4 x i16>* %s @@ -1881,13 +1829,8 @@ ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.narrow_i32x4_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <4 x i16>* %p to i32 %r = add nsw i32 %q, 16 @@ -1918,13 +1861,8 @@ ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i32.const 8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.narrow_i32x4_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr <4 x i16>, <4 x i16>* %p, i32 1 store <4 x i16> %v , <4 x i16>* %s @@ -1948,14 +1886,9 @@ ; CHECK-LABEL: store_narrowing_v4i32_to_numeric_address: ; CHECK: .functype store_narrowing_v4i32_to_numeric_address (v128) -> () ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: i32.const 32 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.narrow_i32x4_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store 32 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = inttoptr i32 32 to <4 x i16>* store <4 x i16> %v , <4 x i16>* %s @@ -1978,14 +1911,9 @@ ; CHECK-LABEL: store_narrowing_v4i32_to_global_address: ; CHECK: .functype store_narrowing_v4i32_to_global_address (v128) -> () ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: i32.const gv_v4i16 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.and -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.narrow_i32x4_u -; CHECK-NEXT: i64x2.extract_lane 0 -; CHECK-NEXT: i64.store gv_v4i16 +; CHECK-NEXT: v128.store64_lane 0, 0 ; CHECK-NEXT: # fallthrough-return store <4 x i16> %v , <4 x i16>* @gv_v4i16 ret void @@ -2047,7 +1975,7 @@ ; CHECK: .functype load_ext_v2i64 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i64x2.load32x2_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %v = load <2 x i32>, <2 x i32>* %p 
ret <2 x i32> %v @@ -2118,7 +2046,7 @@ ; CHECK: .functype load_ext_v2i64_with_folded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i64x2.load32x2_u 16 +; CHECK-NEXT: v128.load64_zero 16 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <2 x i32>* %p to i32 %r = add nuw i32 %q, 16 @@ -2184,7 +2112,7 @@ ; CHECK: .functype load_ext_v2i64_with_folded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i64x2.load32x2_u 8 +; CHECK-NEXT: v128.load64_zero 8 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <2 x i32>, <2 x i32>* %p, i32 1 %v = load <2 x i32>, <2 x i32>* %s @@ -2258,7 +2186,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const -8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i64x2.load32x2_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <2 x i32>, <2 x i32>* %p, i32 -1 %v = load <2 x i32>, <2 x i32>* %s @@ -2340,7 +2268,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i64x2.load32x2_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint <2 x i32>* %p to i32 %r = add nsw i32 %q, 16 @@ -2416,7 +2344,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 8 ; CHECK-NEXT: i32.add -; CHECK-NEXT: i64x2.load32x2_u 0 +; CHECK-NEXT: v128.load64_zero 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr <2 x i32>, <2 x i32>* %p, i32 1 %v = load <2 x i32>, <2 x i32>* %s @@ -2480,7 +2408,7 @@ ; CHECK: .functype load_ext_v2i64_from_numeric_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: i64x2.load32x2_u 32 +; CHECK-NEXT: v128.load64_zero 32 ; CHECK-NEXT: # fallthrough-return %s = inttoptr i32 32 to <2 x i32>* %v = load <2 x i32>, <2 x i32>* %s @@ -2543,7 +2471,7 @@ ; CHECK: .functype load_ext_v2i64_from_global_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 -; CHECK-NEXT: i64x2.load32x2_u gv_v2i32 +; CHECK-NEXT: v128.load64_zero gv_v2i32 ; CHECK-NEXT: # fallthrough-return %v = load <2 x i32>, <2 x i32>* @gv_v2i32 ret <2 x i32> %v diff --git a/llvm/test/CodeGen/WebAssembly/simd-scalar-to-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-scalar-to-vector.ll deleted file mode 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-scalar-to-vector.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s - -; Test that scalar_to_vector is lowered into a splat correctly. -; This bugpoint-reduced code turns into the selection dag below. -; TODO: find small test cases that produce scalar_to_vector dag nodes -; to make this test more readable and comprehensive. 
- -; t0: ch = EntryToken -; t32: i32,ch = load<(load 4 from `<2 x i16>* undef`, align 1)> t0, undef:i32, undef:i32 -; t33: v4i32 = scalar_to_vector t32 -; t34: v8i16 = bitcast t33 -; t51: i32 = extract_vector_elt t34, Constant:i32<0> -; t52: ch = store<(store 2 into `<4 x i16>* undef`, align 1), trunc to i16> t32:1, t51, undef:i32, undef:i32 -; t50: i32 = extract_vector_elt t34, Constant:i32<1> -; t53: ch = store<(store 2 into `<4 x i16>* undef` + 2, align 1), trunc to i16> t32:1, t50, undef:i32, undef:i32 -; t49: i32 = extract_vector_elt t34, Constant:i32<2> -; t55: ch = store<(store 2 into `<4 x i16>* undef` + 4, align 1), trunc to i16> t32:1, t49, undef:i32, undef:i32 -; t48: i32 = extract_vector_elt t34, Constant:i32<3> -; t57: ch = store<(store 2 into `<4 x i16>* undef` + 6, align 1), trunc to i16> t32:1, t48, undef:i32, undef:i32 -; t58: ch = TokenFactor t52, t53, t55, t57 -; t24: ch = WebAssemblyISD::RETURN t58 - -target triple = "wasm32-unknown-unknown" - -; CHECK-LABEL: foo: -; CHECK: i64x2.splat -define void @foo() { -entry: - %a = load <2 x i16>, <2 x i16>* undef, align 1 - %b = shufflevector <2 x i16> %a, <2 x i16> undef, <8 x i32> - %0 = bitcast <8 x i16> %b to <16 x i8> - %shuffle.i214 = shufflevector <16 x i8> %0, <16 x i8> , <16 x i32> - %1 = bitcast <16 x i8> %shuffle.i214 to <8 x i16> - %add82 = add <8 x i16> %1, zeroinitializer - %2 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> %add82 - %3 = bitcast <8 x i16> %2 to <16 x i8> - %shuffle.i204 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> - %4 = bitcast <16 x i8> %shuffle.i204 to <8 x i16> - %dst2.0.vec.extract = shufflevector <8 x i16> %4, <8 x i16> undef, <4 x i32> - store <4 x i16> %dst2.0.vec.extract, <4 x i16>* undef, align 1 - ret void -}
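
Note on the net effect (illustration, not part of the patch): with getPreferredVectorAction returning TypeWidenVector, <8 x i8>, <4 x i16>, and <2 x i32> values are widened into a full v128 register, loaded with v128.load64_zero via the new load_scalar patterns, and stored with v128.store64_lane, replacing the old extending-load and mask/narrow/i64.store sequences. A minimal standalone .ll sketch of that behavior follows; the file layout and function name are illustrative only, and the expected instructions are taken from the updated CHECK lines above (exact immediates and instruction scheduling may differ).

; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s

target triple = "wasm32-unknown-unknown"

; A <4 x i16> value is widened to v8i16, so the load is selected as
; v128.load64_zero and the truncating store as v128.store64_lane of lane 0.
; CHECK-LABEL: copy_v4i16:
; CHECK: v128.load64_zero
; CHECK: v128.store64_lane
define void @copy_v4i16(<4 x i16>* %src, <4 x i16>* %dst) {
  %v = load <4 x i16>, <4 x i16>* %src
  store <4 x i16> %v, <4 x i16>* %dst
  ret void
}

Because only the low 64 bits of the widened vector are stored, the high lanes are simply ignored, which is why the old truncstore patterns that masked and narrowed the value before an i64.store are no longer needed.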