diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -169,14 +169,5 @@ TARGET_BUILTIN(__builtin_wasm_narrow_s_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_narrow_u_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_s_i16x8_i8x16, "V8sV16c", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_s_i16x8_i8x16, "V8sV16c", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_u_i16x8_i8x16, "V8sV16c", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_u_i16x8_i8x16, "V8sV16c", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_s_i32x4_i16x8, "V4iV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_s_i32x4_i16x8, "V4iV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_u_i32x4_i16x8, "V4iV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_u_i32x4_i16x8, "V4iV8s", "nc", "simd128") - #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16528,40 +16528,6 @@ CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()}); return Builder.CreateCall(Callee, {Low, High}); } - case WebAssembly::BI__builtin_wasm_widen_low_s_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_high_s_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_low_u_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_high_u_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_low_s_i32x4_i16x8: - case WebAssembly::BI__builtin_wasm_widen_high_s_i32x4_i16x8: - case WebAssembly::BI__builtin_wasm_widen_low_u_i32x4_i16x8: - case WebAssembly::BI__builtin_wasm_widen_high_u_i32x4_i16x8: { - Value *Vec = EmitScalarExpr(E->getArg(0)); - unsigned IntNo; - switch 
(BuiltinID) { - case WebAssembly::BI__builtin_wasm_widen_low_s_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_low_s_i32x4_i16x8: - IntNo = Intrinsic::wasm_widen_low_signed; - break; - case WebAssembly::BI__builtin_wasm_widen_high_s_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_high_s_i32x4_i16x8: - IntNo = Intrinsic::wasm_widen_high_signed; - break; - case WebAssembly::BI__builtin_wasm_widen_low_u_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_low_u_i32x4_i16x8: - IntNo = Intrinsic::wasm_widen_low_unsigned; - break; - case WebAssembly::BI__builtin_wasm_widen_high_u_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_high_u_i32x4_i16x8: - IntNo = Intrinsic::wasm_widen_high_unsigned; - break; - default: - llvm_unreachable("unexpected builtin ID"); - } - Function *Callee = - CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Vec->getType()}); - return Builder.CreateCall(Callee, Vec); - } case WebAssembly::BI__builtin_wasm_shuffle_v8x16: { Value *Ops[18]; size_t OpIdx = 0; diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -35,6 +35,13 @@ typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16))); typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16))); +typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8))); +typedef unsigned char __u8x8 + __attribute__((__vector_size__(8), __aligned__(8))); +typedef short __i16x4 __attribute__((__vector_size__(8), __aligned__(8))); +typedef unsigned short __u16x4 + __attribute__((__vector_size__(8), __aligned__(8))); + #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("simd128"), \ __min_vector_width__(128))) @@ -1089,42 +1096,70 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_low_i8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_s_i16x8_i8x16((__i8x16)__a); + return 
(v128_t) __builtin_convertvector( + (__i8x8){((__i8x16)__a)[0], ((__i8x16)__a)[1], ((__i8x16)__a)[2], + ((__i8x16)__a)[3], ((__i8x16)__a)[4], ((__i8x16)__a)[5], + ((__i8x16)__a)[6], ((__i8x16)__a)[7]}, + __i16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_high_i8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_s_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__i8x8){((__i8x16)__a)[8], ((__i8x16)__a)[9], ((__i8x16)__a)[10], + ((__i8x16)__a)[11], ((__i8x16)__a)[12], ((__i8x16)__a)[13], + ((__i8x16)__a)[14], ((__i8x16)__a)[15]}, + __i16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_low_u8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_u_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__u8x8){((__u8x16)__a)[0], ((__u8x16)__a)[1], ((__u8x16)__a)[2], + ((__u8x16)__a)[3], ((__u8x16)__a)[4], ((__u8x16)__a)[5], + ((__u8x16)__a)[6], ((__u8x16)__a)[7]}, + __u16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_high_u8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_u_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__u8x8){((__u8x16)__a)[8], ((__u8x16)__a)[9], ((__u8x16)__a)[10], + ((__u8x16)__a)[11], ((__u8x16)__a)[12], ((__u8x16)__a)[13], + ((__u8x16)__a)[14], ((__u8x16)__a)[15]}, + __u16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_low_i16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_s_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__i16x4){((__i16x8)__a)[0], ((__i16x8)__a)[1], ((__i16x8)__a)[2], + ((__i16x8)__a)[3]}, + __i32x4); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_high_i16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_s_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__i16x4){((__i16x8)__a)[4], ((__i16x8)__a)[5], ((__i16x8)__a)[6], + ((__i16x8)__a)[7]}, + __i32x4); } static 
__inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_low_u16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_u_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__u16x4){((__u16x8)__a)[0], ((__u16x8)__a)[1], ((__u16x8)__a)[2], + ((__u16x8)__a)[3]}, + __u32x4); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_high_u16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_u_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__u16x4){((__u16x8)__a)[4], ((__u16x8)__a)[5], ((__u16x8)__a)[6], + ((__u16x8)__a)[7]}, + __u32x4); } // Undefine helper macros diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -737,54 +737,6 @@ // WEBASSEMBLY: ret } -i16x8 widen_low_s_i16x8_i8x16(i8x16 v) { - return __builtin_wasm_widen_low_s_i16x8_i8x16(v); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.widen.low.signed.v8i16.v16i8(<16 x i8> %v) - // WEBASSEMBLY: ret -} - -i16x8 widen_high_s_i16x8_i8x16(i8x16 v) { - return __builtin_wasm_widen_high_s_i16x8_i8x16(v); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.widen.high.signed.v8i16.v16i8(<16 x i8> %v) - // WEBASSEMBLY: ret -} - -i16x8 widen_low_u_i16x8_i8x16(i8x16 v) { - return __builtin_wasm_widen_low_u_i16x8_i8x16(v); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.widen.low.unsigned.v8i16.v16i8(<16 x i8> %v) - // WEBASSEMBLY: ret -} - -i16x8 widen_high_u_i16x8_i8x16(i8x16 v) { - return __builtin_wasm_widen_high_u_i16x8_i8x16(v); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.widen.high.unsigned.v8i16.v16i8(<16 x i8> %v) - // WEBASSEMBLY: ret -} - -i32x4 widen_low_s_i32x4_i16x8(i16x8 v) { - return __builtin_wasm_widen_low_s_i32x4_i16x8(v); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.low.signed.v4i32.v8i16(<8 x i16> %v) - // WEBASSEMBLY: ret -} - -i32x4 widen_high_s_i32x4_i16x8(i16x8 v) { - return __builtin_wasm_widen_high_s_i32x4_i16x8(v); - // 
WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.high.signed.v4i32.v8i16(<8 x i16> %v) - // WEBASSEMBLY: ret -} - -i32x4 widen_low_u_i32x4_i16x8(i16x8 v) { - return __builtin_wasm_widen_low_u_i32x4_i16x8(v); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.low.unsigned.v4i32.v8i16(<8 x i16> %v) - // WEBASSEMBLY: ret -} - -i32x4 widen_high_u_i32x4_i16x8(i16x8 v) { - return __builtin_wasm_widen_high_u_i32x4_i16x8(v); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.high.unsigned.v4i32.v8i16(<8 x i16> %v) - // WEBASSEMBLY: ret -} - i8x16 swizzle_v8x16(i8x16 x, i8x16 y) { return __builtin_wasm_swizzle_v8x16(x, y); // WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y) diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -159,22 +159,6 @@ Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_low_signed : - Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_high_signed : - Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_low_unsigned : - Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_high_unsigned : - Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem, IntrSpeculatable]>; // TODO: Replace these intrinsics with normal ISel patterns def int_wasm_pmin : diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -29,6 +29,10 @@ HANDLE_NODETYPE(VEC_SHL) HANDLE_NODETYPE(VEC_SHR_S) HANDLE_NODETYPE(VEC_SHR_U) +HANDLE_NODETYPE(WIDEN_LOW_S) +HANDLE_NODETYPE(WIDEN_LOW_U) 
+HANDLE_NODETYPE(WIDEN_HIGH_S)
+HANDLE_NODETYPE(WIDEN_HIGH_U)
 HANDLE_NODETYPE(THROW)
 HANDLE_NODETYPE(MEMORY_COPY)
 HANDLE_NODETYPE(MEMORY_FILL)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -123,6 +123,10 @@
   // Hoist bitcasts out of shuffles
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
 
+  // Combine extends of extract_subvectors into widening ops
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+
   // Support saturating add for i8x16 and i16x8
   for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
     for (auto T : {MVT::v16i8, MVT::v8i16})
@@ -1745,6 +1749,49 @@
   return DAG.getBitcast(DstType, NewShuffle);
 }
 
+static SDValue performVectorWidenCombine(SDNode *N,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  auto &DAG = DCI.DAG;
+  assert(N->getOpcode() == ISD::SIGN_EXTEND ||
+         N->getOpcode() == ISD::ZERO_EXTEND);
+
+  // Combine ({s,z}ext (extract_subvector src, i)) into a widening operation if
+  // possible before the extract_subvector can be expanded.
+  auto Extract = N->getOperand(0);
+  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return SDValue();
+  auto Source = Extract.getOperand(0);
+  auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
+  if (IndexNode == nullptr)
+    return SDValue();
+  auto Index = IndexNode->getZExtValue();
+
+  // Only v8i8 and v4i16 extracts can be widened, and only if the extracted
+  // subvector is the low or high half of its source.
+  EVT ResVT = N->getValueType(0);
+  if (ResVT == MVT::v8i16) {
+    if (Extract.getValueType() != MVT::v8i8 ||
+        Source.getValueType() != MVT::v16i8 || (Index != 0 && Index != 8))
+      return SDValue();
+  } else if (ResVT == MVT::v4i32) {
+    if (Extract.getValueType() != MVT::v4i16 ||
+        Source.getValueType() != MVT::v8i16 || (Index != 0 && Index != 4))
+      return SDValue();
+  } else {
+    return SDValue();
+  }
+
+  bool IsSext = N->getOpcode() == ISD::SIGN_EXTEND;
+  bool IsLow = Index == 0;
+
+  unsigned Op = IsSext ? (IsLow ? WebAssemblyISD::WIDEN_LOW_S
+                                : WebAssemblyISD::WIDEN_HIGH_S)
+                       : (IsLow ? WebAssemblyISD::WIDEN_LOW_U
+                                : WebAssemblyISD::WIDEN_HIGH_U);
+
+  return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+}
+
 SDValue
 WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
@@ -1753,5 +1800,8 @@
     return SDValue();
   case ISD::VECTOR_SHUFFLE:
     return performVECTOR_SHUFFLECombine(N, DCI);
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    return performVectorWidenCombine(N, DCI);
   }
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -892,15 +892,21 @@
                        (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
 
 // Widening operations
+def widen_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+def widen_low_s : SDNode<"WebAssemblyISD::WIDEN_LOW_S", widen_t>;
+def widen_high_s : SDNode<"WebAssemblyISD::WIDEN_HIGH_S", widen_t>;
+def widen_low_u : SDNode<"WebAssemblyISD::WIDEN_LOW_U", widen_t>;
+def widen_high_u : SDNode<"WebAssemblyISD::WIDEN_HIGH_U", widen_t>;
+
 multiclass SIMDWiden<ValueType vec_t, string vec, ValueType arg_t, string arg,
                      bits<32> baseInst> {
-  defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_low_signed,
+  defm "" : SIMDConvert<vec_t, arg_t, widen_low_s,
                         "widen_low_"#arg#"_s", baseInst>;
-  defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_high_signed,
+  defm "" : SIMDConvert<vec_t, arg_t, widen_high_s,
                         "widen_high_"#arg#"_s", !add(baseInst, 1)>;
-  defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_low_unsigned,
+  defm "" : SIMDConvert<vec_t, arg_t, widen_low_u,
                         "widen_low_"#arg#"_u", !add(baseInst, 2)>;
-  defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_high_unsigned,
+  defm "" : SIMDConvert<vec_t, arg_t, widen_high_u,
                         "widen_high_"#arg#"_u", !add(baseInst, 3)>;
 }
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -294,46 +294,6 @@
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: widen_low_signed_v8i16:
-; SIMD128-NEXT: .functype widen_low_signed_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.widen_low_i8x16_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <8 x i16> @llvm.wasm.widen.low.signed.v8i16.v16i8(<16 x i8>)
-define <8 x i16> @widen_low_signed_v8i16(<16 x i8> %v) {
-  %a = call <8 x i16> @llvm.wasm.widen.low.signed.v8i16.v16i8(<16 x i8> %v)
-  ret <8 x i16> %a
-}
-
-; CHECK-LABEL: widen_high_signed_v8i16:
-; SIMD128-NEXT: .functype widen_high_signed_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.widen_high_i8x16_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <8 x i16> @llvm.wasm.widen.high.signed.v8i16.v16i8(<16 x i8>)
-define <8 x i16> @widen_high_signed_v8i16(<16 x i8> %v) {
-  %a = call <8 x i16> @llvm.wasm.widen.high.signed.v8i16.v16i8(<16 x i8> %v)
-  ret <8 x i16> %a
-}
-
-; CHECK-LABEL: widen_low_unsigned_v8i16:
-; SIMD128-NEXT: .functype widen_low_unsigned_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.widen_low_i8x16_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <8 x i16> @llvm.wasm.widen.low.unsigned.v8i16.v16i8(<16 x i8>)
-define <8 x i16> @widen_low_unsigned_v8i16(<16 x i8> %v) {
-  %a = call <8 x i16> @llvm.wasm.widen.low.unsigned.v8i16.v16i8(<16 x i8> %v)
-  ret <8 x i16> %a
-}
-
-; CHECK-LABEL: widen_high_unsigned_v8i16:
-; SIMD128-NEXT: .functype widen_high_unsigned_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.widen_high_i8x16_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <8 x i16> @llvm.wasm.widen.high.unsigned.v8i16.v16i8(<16 x i8>)
-define <8 x i16> @widen_high_unsigned_v8i16(<16 x i8> %v) {
-  %a = call <8 x i16> @llvm.wasm.widen.high.unsigned.v8i16.v16i8(<16 x i8> %v)
-  ret <8 x i16> %a
-}
-
 ; ==============================================================================
 ; 4 x i32
 ; ==============================================================================
@@ -411,46 +371,6 @@
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: widen_low_signed_v4i32:
-; SIMD128-NEXT: .functype widen_low_signed_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.widen_low_i16x8_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <4 x i32> @llvm.wasm.widen.low.signed.v4i32.v8i16(<8 x i16>)
-define <4 x i32> @widen_low_signed_v4i32(<8 x i16> %v) {
-  %a = call <4 x i32> @llvm.wasm.widen.low.signed.v4i32.v8i16(<8 x i16> %v)
-  ret <4 x i32> %a
-}
-
-; CHECK-LABEL: widen_high_signed_v4i32:
-; SIMD128-NEXT: .functype widen_high_signed_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.widen_high_i16x8_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <4 x i32> @llvm.wasm.widen.high.signed.v4i32.v8i16(<8 x i16>)
-define <4 x i32> @widen_high_signed_v4i32(<8 x i16> %v) {
-  %a = call <4 x i32> @llvm.wasm.widen.high.signed.v4i32.v8i16(<8 x i16> %v)
-  ret <4 x i32> %a
-}
-
-; CHECK-LABEL: widen_low_unsigned_v4i32:
-; SIMD128-NEXT: .functype widen_low_unsigned_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.widen_low_i16x8_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <4 x i32> @llvm.wasm.widen.low.unsigned.v4i32.v8i16(<8 x i16>)
-define <4 x i32> @widen_low_unsigned_v4i32(<8 x i16> %v) {
-  %a = call <4 x i32> @llvm.wasm.widen.low.unsigned.v4i32.v8i16(<8 x i16> %v)
-  ret <4 x i32> %a
-}
-
-; CHECK-LABEL: widen_high_unsigned_v4i32:
-; SIMD128-NEXT: .functype widen_high_unsigned_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.widen_high_i16x8_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <4 x i32> @llvm.wasm.widen.high.unsigned.v4i32.v8i16(<8 x i16>)
-define <4 x i32> @widen_high_unsigned_v4i32(<8 x i16> %v) {
-  %a = call <4 x i32> @llvm.wasm.widen.high.unsigned.v4i32.v8i16(<8 x i16> %v)
-  ret <4 x i32> %a
-}
-
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
diff --git a/llvm/test/CodeGen/WebAssembly/simd-widening.ll b/llvm/test/CodeGen/WebAssembly/simd-widening.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-widening.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s
+
+;; Test that SIMD widening operations can be successfully selected
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+define <8 x i16> @widen_low_i8x16_s(<16 x i8> %v) {
+; CHECK-LABEL: widen_low_i8x16_s:
+; CHECK:         .functype widen_low_i8x16_s (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.widen_low_i8x16_s
+; CHECK-NEXT:    # fallthrough-return
+  %low = shufflevector <16 x i8> %v, <16 x i8> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %widened = sext <8 x i8> %low to <8 x i16>
+  ret <8 x i16> %widened
+}
+
+define <8 x i16> @widen_low_i8x16_u(<16 x i8> %v) {
+; CHECK-LABEL: widen_low_i8x16_u:
+; CHECK:         .functype widen_low_i8x16_u (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.widen_low_i8x16_u
+; CHECK-NEXT:    # fallthrough-return
+  %low = shufflevector <16 x i8> %v, <16 x i8> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %widened = zext <8 x i8> %low to <8 x i16>
+  ret <8 x i16> %widened
+}
+
+define <8 x i16> @widen_high_i8x16_s(<16 x i8> %v) {
+; CHECK-LABEL: widen_high_i8x16_s:
+; CHECK:         .functype widen_high_i8x16_s (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.widen_high_i8x16_s
+; CHECK-NEXT:    # fallthrough-return
+  %low = shufflevector <16 x i8> %v, <16 x i8> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %widened = sext <8 x i8> %low to <8 x i16>
+  ret <8 x i16> %widened
+}
+
+define <8 x i16> @widen_high_i8x16_u(<16 x i8> %v) {
+; CHECK-LABEL: widen_high_i8x16_u:
+; CHECK:         .functype widen_high_i8x16_u (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.widen_high_i8x16_u
+; CHECK-NEXT:    # fallthrough-return
+  %low = shufflevector <16 x i8> %v, <16 x i8> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %widened = zext <8 x i8> %low to <8 x i16>
+  ret <8 x i16> %widened
+}
+
+define <4 x i32> @widen_low_i16x8_s(<8 x i16> %v) {
+; CHECK-LABEL: widen_low_i16x8_s:
+; CHECK:         .functype widen_low_i16x8_s (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.widen_low_i16x8_s
+; CHECK-NEXT:    # fallthrough-return
+  %low = shufflevector <8 x i16> %v, <8 x i16> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %widened = sext <4 x i16> %low to <4 x i32>
+  ret <4 x i32> %widened
+}
+
+define <4 x i32> @widen_low_i16x8_u(<8 x i16> %v) {
+; CHECK-LABEL: widen_low_i16x8_u:
+; CHECK:         .functype widen_low_i16x8_u (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.widen_low_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %low = shufflevector <8 x i16> %v, <8 x i16> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %widened = zext <4 x i16> %low to <4 x i32>
+  ret <4 x i32> %widened
+}
+
+define <4 x i32> @widen_high_i16x8_s(<8 x i16> %v) {
+; CHECK-LABEL: widen_high_i16x8_s:
+; CHECK:         .functype widen_high_i16x8_s (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.widen_high_i16x8_s
+; CHECK-NEXT:    # fallthrough-return
+  %low = shufflevector <8 x i16> %v, <8 x i16> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %widened = sext <4 x i16> %low to <4 x i32>
+  ret <4 x i32> %widened
+}
+
+define <4 x i32> @widen_high_i16x8_u(<8 x i16> %v) {
+; CHECK-LABEL: widen_high_i16x8_u:
+; CHECK:         .functype widen_high_i16x8_u (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.widen_high_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %low = shufflevector <8 x i16> %v, <8 x i16> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %widened = zext <4 x i16> %low to <4 x i32>
+  ret <4 x i32> %widened
+}
+
+;; Also test that similar patterns with offsets not corresponding to
+;; the low or high half are correctly expanded.
+
+define <8 x i16> @widen_lowish_i8x16_s(<16 x i8> %v) {
+; CHECK-LABEL: widen_lowish_i8x16_s:
+; CHECK:         .functype widen_lowish_i8x16_s (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.extract_lane_u 1
+; CHECK-NEXT:    i16x8.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.extract_lane_u 2
+; CHECK-NEXT:    i16x8.replace_lane 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.extract_lane_u 3
+; CHECK-NEXT:    i16x8.replace_lane 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.extract_lane_u 4
+; CHECK-NEXT:    i16x8.replace_lane 3
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.extract_lane_u 5
+; CHECK-NEXT:    i16x8.replace_lane 4
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.extract_lane_u 6
+; CHECK-NEXT:    i16x8.replace_lane 5
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.extract_lane_u 7
+; CHECK-NEXT:    i16x8.replace_lane 6
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.extract_lane_u 8
+; CHECK-NEXT:    i16x8.replace_lane 7
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i16x8.shl
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i16x8.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %lowish = shufflevector <16 x i8> %v, <16 x i8> undef,
+           <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %widened = sext <8 x i8> %lowish to <8 x i16>
+  ret <8 x i16> %widened
+}
+
+define <4 x i32> @widen_lowish_i16x8_s(<8 x i16> %v) {
+; CHECK-LABEL: widen_lowish_i16x8_s:
+; CHECK:         .functype widen_lowish_i16x8_s (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.extract_lane_u 1
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.extract_lane_u 2
+; CHECK-NEXT:    i32x4.replace_lane 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.extract_lane_u 3
+; CHECK-NEXT:    i32x4.replace_lane 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.extract_lane_u 4
+; CHECK-NEXT:    i32x4.replace_lane 3
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %lowish = shufflevector <8 x i16> %v, <8 x i16> undef,
+           <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  %widened = sext <4 x i16> %lowish to <4 x i32>
+  ret <4 x i32> %widened
+}