diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -195,10 +195,6 @@ TARGET_BUILTIN(__builtin_wasm_load32_zero, "V4iiC*", "n", "simd128") TARGET_BUILTIN(__builtin_wasm_load64_zero, "V2LLiLLiC*", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_load8_lane, "V16ScScC*V16ScIi", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_load16_lane, "V8ssC*V8sIi", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_load32_lane, "V4iiC*V4iIi", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_load64_lane, "V2LLiLLiC*V2LLiIi", "n", "simd128") TARGET_BUILTIN(__builtin_wasm_store8_lane, "vSc*V16ScIi", "n", "simd128") TARGET_BUILTIN(__builtin_wasm_store16_lane, "vs*V8sIi", "n", "simd128") TARGET_BUILTIN(__builtin_wasm_store32_lane, "vi*V4iIi", "n", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17776,10 +17776,6 @@ Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load64_zero); return Builder.CreateCall(Callee, {Ptr}); } - case WebAssembly::BI__builtin_wasm_load8_lane: - case WebAssembly::BI__builtin_wasm_load16_lane: - case WebAssembly::BI__builtin_wasm_load32_lane: - case WebAssembly::BI__builtin_wasm_load64_lane: case WebAssembly::BI__builtin_wasm_store8_lane: case WebAssembly::BI__builtin_wasm_store16_lane: case WebAssembly::BI__builtin_wasm_store32_lane: @@ -17792,18 +17788,6 @@ Value *LaneIdx = llvm::ConstantInt::get(getLLVMContext(), *LaneIdxConst); unsigned IntNo; switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_load8_lane: - IntNo = Intrinsic::wasm_load8_lane; - break; - case WebAssembly::BI__builtin_wasm_load16_lane: - IntNo = Intrinsic::wasm_load16_lane; - break; - case WebAssembly::BI__builtin_wasm_load32_lane: - IntNo = Intrinsic::wasm_load32_lane; - break; - case WebAssembly::BI__builtin_wasm_load64_lane: - IntNo = Intrinsic::wasm_load64_lane; - break; case WebAssembly::BI__builtin_wasm_store8_lane: IntNo = Intrinsic::wasm_store8_lane; break; diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -170,21 +170,49 @@ return (v128_t)(__i64x2){__v, 0}; } -#define wasm_v128_load8_lane(__ptr, __vec, __i) \ - ((v128_t)__builtin_wasm_load8_lane((const signed char *)(__ptr), \ - (__i8x16)(__vec), (__i))) +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_v128_load8_lane( + const void *__mem, v128_t __vec, int __i) __REQUIRE_CONSTANT(__i) { + struct __wasm_v128_load8_lane_struct { + int8_t __v; + } __attribute__((__packed__, __may_alias__)); + int8_t __v = ((const struct __wasm_v128_load8_lane_struct *)__mem)->__v; + __i8x16 __ret = (__i8x16)__vec; + __ret[__i] = __v; + return (v128_t)__ret; +} -#define wasm_v128_load16_lane(__ptr, __vec, __i) \ - ((v128_t)__builtin_wasm_load16_lane((const short *)(__ptr), \ - (__i16x8)(__vec), (__i))) +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_v128_load16_lane( + const void *__mem, v128_t __vec, int __i) __REQUIRE_CONSTANT(__i) { + struct __wasm_v128_load16_lane_struct { + int16_t __v; + } __attribute__((__packed__, __may_alias__)); + int16_t __v = ((const struct __wasm_v128_load16_lane_struct *)__mem)->__v; + __i16x8 __ret = (__i16x8)__vec; + __ret[__i] = __v; + return (v128_t)__ret; +} -#define wasm_v128_load32_lane(__ptr, __vec, __i) \ - 
((v128_t)__builtin_wasm_load32_lane((const int *)(__ptr), (__i32x4)(__vec), \ - (__i))) +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_v128_load32_lane( + const void *__mem, v128_t __vec, int __i) __REQUIRE_CONSTANT(__i) { + struct __wasm_v128_load32_lane_struct { + int32_t __v; + } __attribute__((__packed__, __may_alias__)); + int32_t __v = ((const struct __wasm_v128_load32_lane_struct *)__mem)->__v; + __i32x4 __ret = (__i32x4)__vec; + __ret[__i] = __v; + return (v128_t)__ret; +} -#define wasm_v128_load64_lane(__ptr, __vec, __i) \ - ((v128_t)__builtin_wasm_load64_lane((const long long int *)(__ptr), \ - (__i64x2)(__vec), (__i))) +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_v128_load64_lane( + const void *__mem, v128_t __vec, int __i) __REQUIRE_CONSTANT(__i) { + struct __wasm_v128_load64_lane_struct { + int64_t __v; + } __attribute__((__packed__, __may_alias__)); + int64_t __v = ((const struct __wasm_v128_load64_lane_struct *)__mem)->__v; + __i64x2 __ret = (__i64x2)__vec; + __ret[__i] = __v; + return (v128_t)__ret; +} static __inline__ void __DEFAULT_FN_ATTRS wasm_v128_store(void *__mem, v128_t __a) { diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -284,34 +284,6 @@ // WEBASSEMBLY-NEXT: ret } -i8x16 load8_lane(const signed char *p, i8x16 v) { - return __builtin_wasm_load8_lane(p, v, 0); - // WEBASSEMBLY: tail call <16 x i8> @llvm.wasm.load8.lane( - // WEBASSEMBLY-SAME: i8* %p, <16 x i8> %v, i32 0) - // WEBASSEMBLY-NEXT: ret -} - -i16x8 load16_lane(const short *p, i16x8 v) { - return __builtin_wasm_load16_lane(p, v, 0); - // WEBASSEMBLY: tail call <8 x i16> @llvm.wasm.load16.lane( - // WEBASSEMBLY-SAME: i16* %p, <8 x i16> %v, i32 0) - // WEBASSEMBLY-NEXT: ret -} - -i32x4 load32_lane(const int *p, i32x4 v) { - return __builtin_wasm_load32_lane(p, v, 0); - // WEBASSEMBLY: tail call <4 x i32> @llvm.wasm.load32.lane( - // WEBASSEMBLY-SAME: i32* %p, <4 x i32> %v, i32 0) - // WEBASSEMBLY-NEXT: ret -} - -i64x2 load64_lane(const long long *p, i64x2 v) { - return __builtin_wasm_load64_lane(p, v, 0); - // WEBASSEMBLY: tail call <2 x i64> @llvm.wasm.load64.lane( - // WEBASSEMBLY-SAME: i64* %p, <2 x i64> %v, i32 0) - // WEBASSEMBLY-NEXT: ret -} - void store8_lane(signed char *p, i8x16 v) { __builtin_wasm_store8_lane(p, v, 0); // WEBASSEMBLY: call void @llvm.wasm.store8.lane( diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c --- a/clang/test/Headers/wasm.c +++ b/clang/test/Headers/wasm.c @@ -162,9 +162,10 @@ // CHECK-LABEL: @test_v128_load8_lane( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.wasm.load8.lane(i8* [[PTR:%.*]], <16 x i8> [[TMP0]], i32 15) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[PTR:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <16 x i8> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[TMP0]], i32 15 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_v128_load8_lane(const uint8_t *ptr, v128_t vec) { @@ -173,9 +174,10 @@ // CHECK-LABEL: @test_v128_load16_lane( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> 
@llvm.wasm.load16.lane(i16* [[PTR:%.*]], <8 x i16> [[TMP0]], i32 7) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[PTR:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <8 x i16> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP0]], i32 7 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_v128_load16_lane(const uint16_t *ptr, v128_t vec) { @@ -184,8 +186,9 @@ // CHECK-LABEL: @test_v128_load32_lane( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.load32.lane(i32* [[PTR:%.*]], <4 x i32> [[VEC:%.*]], i32 3) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[PTR:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x i32> [[VEC:%.*]], i32 [[TMP0]], i32 3 +// CHECK-NEXT: ret <4 x i32> [[VECINS_I]] // v128_t test_v128_load32_lane(const uint32_t *ptr, v128_t vec) { return wasm_v128_load32_lane(ptr, vec, 3); @@ -193,9 +196,10 @@ // CHECK-LABEL: @test_v128_load64_lane( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.wasm.load64.lane(i64* [[PTR:%.*]], <2 x i64> [[TMP0]], i32 1) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[PTR:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <2 x i64> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP0]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_v128_load64_lane(const uint64_t *ptr, v128_t vec) { @@ -1281,7 +1285,7 @@ // CHECK-LABEL: @test_v128_any_true( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR10:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR8:[0-9]+]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1291,7 +1295,7 @@ // CHECK-LABEL: @test_v128_bitselect( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP0]] // v128_t test_v128_bitselect(v128_t a, v128_t b, v128_t mask) { @@ -1301,7 +1305,7 @@ // CHECK-LABEL: @test_i8x16_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -1323,7 +1327,7 @@ // CHECK-LABEL: @test_i8x16_all_true( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 
@llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1334,7 +1338,7 @@ // CHECK-LABEL: @test_i8x16_bitmask( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: ret i32 [[TMP1]] // int32_t test_i8x16_bitmask(v128_t a) { @@ -1344,7 +1348,7 @@ // CHECK-LABEL: @test_i8x16_popcnt( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.wasm.popcnt(<16 x i8> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.wasm.popcnt(<16 x i8> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -1410,7 +1414,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1422,7 +1426,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1446,7 +1450,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1458,7 +1462,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1522,7 +1526,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x 
i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1533,7 +1537,7 @@ // CHECK-LABEL: @test_i16x8_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -1555,7 +1559,7 @@ // CHECK-LABEL: @test_i16x8_all_true( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1566,7 +1570,7 @@ // CHECK-LABEL: @test_i16x8_bitmask( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: ret i32 [[TMP1]] // int32_t test_i16x8_bitmask(v128_t a) { @@ -1631,7 +1635,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1643,7 +1647,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1667,7 +1671,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1679,7 +1683,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x 
i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1755,7 +1759,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1765,7 +1769,7 @@ // CHECK-LABEL: @test_i32x4_abs( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A:%.*]], i1 false) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A:%.*]], i1 false) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP0]] // v128_t test_i32x4_abs(v128_t a) { @@ -1783,7 +1787,7 @@ // CHECK-LABEL: @test_i32x4_all_true( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR8]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP0]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1793,7 +1797,7 @@ // CHECK-LABEL: @test_i32x4_bitmask( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR8]] // CHECK-NEXT: ret i32 [[TMP0]] // int32_t test_i32x4_bitmask(v128_t a) { @@ -1904,7 +1908,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i32x4_dot_i16x8(v128_t a, v128_t b) { @@ -1914,7 +1918,7 @@ // CHECK-LABEL: @test_i64x2_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -1936,7 +1940,7 @@ // CHECK-LABEL: @test_i64x2_all_true( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1947,7 +1951,7 @@ // 
CHECK-LABEL: @test_i64x2_bitmask( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: ret i32 [[TMP1]] // int32_t test_i64x2_bitmask(v128_t a) { @@ -2035,7 +2039,7 @@ // CHECK-LABEL: @test_f32x4_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2057,7 +2061,7 @@ // CHECK-LABEL: @test_f32x4_sqrt( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2068,7 +2072,7 @@ // CHECK-LABEL: @test_f32x4_ceil( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2079,7 +2083,7 @@ // CHECK-LABEL: @test_f32x4_floor( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2090,7 +2094,7 @@ // CHECK-LABEL: @test_f32x4_trunc( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2101,7 +2105,7 @@ // CHECK-LABEL: @test_f32x4_nearest( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2161,7 +2165,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) 
#[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2173,7 +2177,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2185,7 +2189,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2197,7 +2201,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2208,7 +2212,7 @@ // CHECK-LABEL: @test_f64x2_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2230,7 +2234,7 @@ // CHECK-LABEL: @test_f64x2_sqrt( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2241,7 +2245,7 @@ // CHECK-LABEL: @test_f64x2_ceil( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2252,7 +2256,7 @@ // CHECK-LABEL: @test_f64x2_floor( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = 
tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2263,7 +2267,7 @@ // CHECK-LABEL: @test_f64x2_trunc( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2274,7 +2278,7 @@ // CHECK-LABEL: @test_f64x2_nearest( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2334,7 +2338,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2346,7 +2350,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2358,7 +2362,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2370,7 +2374,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ 
-2381,7 +2385,7 @@ // CHECK-LABEL: @test_i32x4_trunc_sat_f32x4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_i32x4_trunc_sat_f32x4(v128_t a) { @@ -2391,7 +2395,7 @@ // CHECK-LABEL: @test_u32x4_trunc_sat_f32x4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_u32x4_trunc_sat_f32x4(v128_t a) { @@ -2443,7 +2447,7 @@ // CHECK-LABEL: @test_i32x4_trunc_sat_f64x2_zero( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2454,7 +2458,7 @@ // CHECK-LABEL: @test_u32x4_trunc_sat_f64x2_zero( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2538,7 +2542,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2550,7 +2554,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2562,7 +2566,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> 
@llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2572,7 +2576,7 @@ // CHECK-LABEL: @test_i16x8_narrow_i32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2582,7 +2586,7 @@ // CHECK-LABEL: @test_u16x8_narrow_i32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2729,7 +2733,7 @@ // CHECK-LABEL: @test_i16x8_extadd_pairwise_i8x16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2740,7 +2744,7 @@ // CHECK-LABEL: @test_u16x8_extadd_pairwise_u8x16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2751,7 +2755,7 @@ // CHECK-LABEL: @test_i32x4_extadd_pairwise_i16x8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_i32x4_extadd_pairwise_i16x8(v128_t a) { @@ -2761,7 +2765,7 @@ // CHECK-LABEL: @test_u32x4_extadd_pairwise_u16x8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_u32x4_extadd_pairwise_u16x8(v128_t a) { @@ -2772,7 +2776,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2784,7 +2788,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2796,7 +2800,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2808,7 +2812,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2820,7 +2824,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i32x4_extmul_low_i16x8(v128_t a, v128_t b) { @@ -2831,7 +2835,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i32x4_extmul_high_i16x8(v128_t a, v128_t b) { @@ -2842,7 +2846,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) 
#[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u32x4_extmul_low_u16x8(v128_t a, v128_t b) { @@ -2853,7 +2857,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u32x4_extmul_high_u16x8(v128_t a, v128_t b) { @@ -2862,7 +2866,7 @@ // CHECK-LABEL: @test_i64x2_extmul_low_i32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2872,7 +2876,7 @@ // CHECK-LABEL: @test_i64x2_extmul_high_i32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2882,7 +2886,7 @@ // CHECK-LABEL: @test_u64x2_extmul_low_u32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2892,7 +2896,7 @@ // CHECK-LABEL: @test_u64x2_extmul_high_u32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2904,7 +2908,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR10]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -191,26 +191,6 @@ // ISD::TargetConstant, which would require extra complications in the ISel // tablegen patterns. 
TODO: Replace these intrinsics with normal ISel patterns once the load_lane instructions are merged to the proposal. -def int_wasm_load8_lane : Intrinsic<[llvm_v16i8_ty], [LLVMPointerType<llvm_i8_ty>, llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>; -def int_wasm_load16_lane : Intrinsic<[llvm_v8i16_ty], [LLVMPointerType<llvm_i16_ty>, llvm_v8i16_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>; -def int_wasm_load32_lane : Intrinsic<[llvm_v4i32_ty], [LLVMPointerType<llvm_i32_ty>, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>; -def int_wasm_load64_lane : Intrinsic<[llvm_v2i64_ty], [LLVMPointerType<llvm_i64_ty>, llvm_v2i64_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>; def int_wasm_store8_lane : Intrinsic<[], [LLVMPointerType<llvm_i8_ty>, llvm_v16i8_ty, llvm_i32_ty], diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -767,49 +767,33 @@ Info.align = Align(1); Info.flags = MachineMemOperand::MOLoad; return true; - case Intrinsic::wasm_load8_lane: - case Intrinsic::wasm_load16_lane: - case Intrinsic::wasm_load32_lane: - case Intrinsic::wasm_load64_lane: case Intrinsic::wasm_store8_lane: case Intrinsic::wasm_store16_lane: case Intrinsic::wasm_store32_lane: case Intrinsic::wasm_store64_lane: { MVT MemVT; switch (Intrinsic) { - case Intrinsic::wasm_load8_lane: case Intrinsic::wasm_store8_lane: MemVT = MVT::i8; break; - case Intrinsic::wasm_load16_lane: case Intrinsic::wasm_store16_lane: MemVT = MVT::i16; break; - case Intrinsic::wasm_load32_lane: case Intrinsic::wasm_store32_lane: MemVT = MVT::i32; break; - case Intrinsic::wasm_load64_lane: case Intrinsic::wasm_store64_lane: MemVT = MVT::i64; break; default: llvm_unreachable("unexpected intrinsic"); } - if (Intrinsic == Intrinsic::wasm_load8_lane || - Intrinsic == Intrinsic::wasm_load16_lane || - Intrinsic == Intrinsic::wasm_load32_lane || - Intrinsic == Intrinsic::wasm_load64_lane) { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.flags = MachineMemOperand::MOLoad; - } else { - Info.opc = ISD::INTRINSIC_VOID; - Info.flags = MachineMemOperand::MOStore; - } - Info.ptrVal = I.getArgOperand(0); + Info.opc = ISD::INTRINSIC_VOID; Info.memVT = MemVT; + Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = Align(1); + Info.flags = MachineMemOperand::MOStore; return true; } default: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -300,8 +300,6 @@ } // mayLoad = 1, UseNamedOperandTable = 1 } -// TODO: Also support v4f32 and v2f64 once the instructions are merged -// to the proposal defm "" : SIMDLoadLane<I8x16, 0x54>; defm "" : SIMDLoadLane<I16x8, 0x55>; defm "" : SIMDLoadLane<I32x4, 0x56>; defm "" : SIMDLoadLane<I64x2, 0x57>; @@ -321,10 +319,24 @@ Requires<[HasAddr64]>; } -defm : LoadLanePatNoOffset<I8x16, int_wasm_load8_lane>; -defm : LoadLanePatNoOffset<I16x8, int_wasm_load16_lane>; -defm : LoadLanePatNoOffset<I32x4, int_wasm_load32_lane>; -defm : LoadLanePatNoOffset<I64x2, int_wasm_load64_lane>; +def load8_lane : + PatFrag<(ops node:$ptr, node:$vec, node:$idx), + (vector_insert $vec, (i32 (extloadi8 $ptr)), $idx)>; +def load16_lane : + PatFrag<(ops node:$ptr, node:$vec, node:$idx), + (vector_insert $vec, (i32 (extloadi16 $ptr)), $idx)>; +def load32_lane : + PatFrag<(ops node:$ptr, node:$vec, node:$idx), + (vector_insert $vec, (i32 (load $ptr)), $idx)>; +def 
load64_lane : + PatFrag<(ops node:$ptr, node:$vec, node:$idx), + (vector_insert $vec, (i64 (load $ptr)), $idx)>; +// TODO: floating point lanes as well + +defm : LoadLanePatNoOffset<I8x16, load8_lane>; +defm : LoadLanePatNoOffset<I16x8, load16_lane>; +defm : LoadLanePatNoOffset<I32x4, load32_lane>; +defm : LoadLanePatNoOffset<I64x2, load64_lane>; // TODO: Also support the other load patterns for load_lane once the instructions // are merged to the proposal. diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll --- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -211,7 +211,7 @@ ; CHECK-LABEL: mashup_const_i8x16: ; CHECK-NEXT: .functype mashup_const_i8x16 (v128, v128, i32) -> (v128) ; CHECK: v128.const $push[[L0:[0-9]+]]=, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0 -; CHECK: i8x16.replace_lane +; CHECK: v128.load8_lane ; CHECK: i8x16.replace_lane ; CHECK: i8x16.replace_lane ; CHECK: return @@ -234,7 +234,7 @@ ; CHECK-LABEL: mashup_splat_i8x16: ; CHECK-NEXT: .functype mashup_splat_i8x16 (v128, v128, i32) -> (v128) ; CHECK: i8x16.splat $push[[L0:[0-9]+]]=, $2 -; CHECK: i8x16.replace_lane +; CHECK: v128.load8_lane ; CHECK: i8x16.replace_lane ; CHECK: return define <16 x i8> @mashup_splat_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %splatted) { diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll --- a/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll @@ -8,11 +8,6 @@ target triple = "wasm32-unknown-unknown" -declare <16 x i8> @llvm.wasm.load8.lane(i8*, <16 x i8>, i32) -declare <8 x i16> @llvm.wasm.load16.lane(i16*, <8 x i16>, i32) -declare <4 x i32> @llvm.wasm.load32.lane(i32*, <4 x i32>, i32) -declare <2 x i64> @llvm.wasm.load64.lane(i64*, <2 x i64>, i32) - declare void @llvm.wasm.store8.lane(i8*, <16 x i8>, i32) declare void @llvm.wasm.store16.lane(i16*, <8 x i16>, i32) declare void @llvm.wasm.store32.lane(i32*, <4 x i32>, i32) @@ -30,7 +25,8 @@ ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: v128.load8_lane 0, 0 ; CHECK-NEXT: # fallthrough-return - %t = tail call <16 x i8> @llvm.wasm.load8.lane(i8* %p, <16 x i8> %v, i32 0) + %x = load i8, i8* %p + %t = insertelement <16 x i8> %v, i8 %x, i32 0 ret <16 x i8> %t } @@ -47,7 +43,8 @@ %q = ptrtoint i8* %p to i32 %r = add nuw i32 %q, 24 %s = inttoptr i32 %r to i8* - %t = tail call <16 x i8> @llvm.wasm.load8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = load i8, i8* %s + %t = insertelement <16 x i8> %v, i8 %x, i32 0 ret <16 x i8> %t } @@ -62,7 +59,8 @@ ; CHECK-NEXT: v128.load8_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds i8, i8* %p, i32 6 - %t = tail call <16 x i8> @llvm.wasm.load8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = load i8, i8* %s + %t = insertelement <16 x i8> %v, i8 %x, i32 0 ret <16 x i8> %t } @@ -77,7 +75,8 @@ ; CHECK-NEXT: v128.load8_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds i8, i8* %p, i32 -6 - %t = tail call <16 x i8> @llvm.wasm.load8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = load i8, i8* %s + %t = insertelement <16 x i8> %v, i8 %x, i32 0 ret <16 x i8> %t } @@ -94,7 +93,8 @@ %q = ptrtoint i8* %p to i32 %r = add nsw i32 %q, 24 %s = inttoptr i32 %r to i8* - %t = tail call <16 x i8> @llvm.wasm.load8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = load i8, i8* %s + %t = insertelement <16 x i8> %v, i8 %x, i32 0 ret <16 x i8> %t } @@ -109,7 +109,8 @@ ; CHECK-NEXT: v128.load8_lane 0, 0 ; CHECK-NEXT: # 
fallthrough-return
  %s = getelementptr i8, i8* %p, i32 6
- %t = tail call <16 x i8> @llvm.wasm.load8.lane(i8* %s, <16 x i8> %v, i32 0)
+ %x = load i8, i8* %s
+ %t = insertelement <16 x i8> %v, i8 %x, i32 0
  ret <16 x i8> %t
 }

@@ -122,7 +123,8 @@
 ; CHECK-NEXT: v128.load8_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = inttoptr i32 42 to i8*
- %t = tail call <16 x i8> @llvm.wasm.load8.lane(i8* %s, <16 x i8> %v, i32 0)
+ %x = load i8, i8* %s
+ %t = insertelement <16 x i8> %v, i8 %x, i32 0
  ret <16 x i8> %t
 }

@@ -135,7 +137,8 @@
 ; CHECK-NEXT: local.get 0
 ; CHECK-NEXT: v128.load8_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
- %t = tail call <16 x i8> @llvm.wasm.load8.lane(i8* @gv_i8, <16 x i8> %v, i32 0)
+ %x = load i8, i8* @gv_i8
+ %t = insertelement <16 x i8> %v, i8 %x, i32 0
  ret <16 x i8> %t
 }

@@ -265,9 +268,10 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: local.get 0
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
- %t = tail call <8 x i16> @llvm.wasm.load16.lane(i16* %p, <8 x i16> %v, i32 0)
+ %x = load i16, i16* %p
+ %t = insertelement <8 x i16> %v, i16 %x, i32 0
  ret <8 x i16> %t
 }

@@ -279,12 +283,13 @@
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %q = ptrtoint i16* %p to i32
  %r = add nuw i32 %q, 24
  %s = inttoptr i32 %r to i16*
- %t = tail call <8 x i16> @llvm.wasm.load16.lane(i16* %s, <8 x i16> %v, i32 0)
+ %x = load i16, i16* %s
+ %t = insertelement <8 x i16> %v, i16 %x, i32 0
  ret <8 x i16> %t
 }

@@ -296,10 +301,11 @@
 ; CHECK-NEXT: i32.const 12
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr inbounds i16, i16* %p, i32 6
- %t = tail call <8 x i16> @llvm.wasm.load16.lane(i16* %s, <8 x i16> %v, i32 0)
+ %x = load i16, i16* %s
+ %t = insertelement <8 x i16> %v, i16 %x, i32 0
  ret <8 x i16> %t
 }

@@ -311,10 +317,11 @@
 ; CHECK-NEXT: i32.const -12
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr inbounds i16, i16* %p, i32 -6
- %t = tail call <8 x i16> @llvm.wasm.load16.lane(i16* %s, <8 x i16> %v, i32 0)
+ %x = load i16, i16* %s
+ %t = insertelement <8 x i16> %v, i16 %x, i32 0
  ret <8 x i16> %t
 }

@@ -326,12 +333,13 @@
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %q = ptrtoint i16* %p to i32
  %r = add nsw i32 %q, 24
  %s = inttoptr i32 %r to i16*
- %t = tail call <8 x i16> @llvm.wasm.load16.lane(i16* %s, <8 x i16> %v, i32 0)
+ %x = load i16, i16* %s
+ %t = insertelement <8 x i16> %v, i16 %x, i32 0
  ret <8 x i16> %t
 }

@@ -343,10 +351,11 @@
 ; CHECK-NEXT: i32.const 12
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr i16, i16* %p, i32 6
- %t = tail call <8 x i16> @llvm.wasm.load16.lane(i16* %s, <8 x i16> %v, i32 0)
+ %x = load i16, i16* %s
+ %t = insertelement <8 x i16> %v, i16 %x, i32 0
  ret <8 x i16> %t
 }

@@ -356,10 +365,11 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const 42
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = inttoptr i32 42 to i16*
- %t = tail call <8 x i16> @llvm.wasm.load16.lane(i16* %s, <8 x i16> %v, i32 0)
+ %x = load i16, i16* %s
+ %t = insertelement <8 x i16> %v, i16 %x, i32 0
  ret <8 x i16> %t
 }

@@ -370,9 +380,10 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const gv_i16
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
- %t = tail call <8 x i16> @llvm.wasm.load16.lane(i16* @gv_i16, <8 x i16> %v, i32 0)
+ %x = load i16, i16* @gv_i16
+ %t = insertelement <8 x i16> %v, i16 %x, i32 0
  ret <8 x i16> %t
 }

@@ -502,9 +513,10 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: local.get 0
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
- %t = tail call <4 x i32> @llvm.wasm.load32.lane(i32* %p, <4 x i32> %v, i32 0)
+ %x = load i32, i32* %p
+ %t = insertelement <4 x i32> %v, i32 %x, i32 0
  ret <4 x i32> %t
 }

@@ -516,12 +528,13 @@
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %q = ptrtoint i32* %p to i32
  %r = add nuw i32 %q, 24
  %s = inttoptr i32 %r to i32*
- %t = tail call <4 x i32> @llvm.wasm.load32.lane(i32* %s, <4 x i32> %v, i32 0)
+ %x = load i32, i32* %s
+ %t = insertelement <4 x i32> %v, i32 %x, i32 0
  ret <4 x i32> %t
 }

@@ -533,10 +546,11 @@
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr inbounds i32, i32* %p, i32 6
- %t = tail call <4 x i32> @llvm.wasm.load32.lane(i32* %s, <4 x i32> %v, i32 0)
+ %x = load i32, i32* %s
+ %t = insertelement <4 x i32> %v, i32 %x, i32 0
  ret <4 x i32> %t
 }

@@ -548,10 +562,11 @@
 ; CHECK-NEXT: i32.const -24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr inbounds i32, i32* %p, i32 -6
- %t = tail call <4 x i32> @llvm.wasm.load32.lane(i32* %s, <4 x i32> %v, i32 0)
+ %x = load i32, i32* %s
+ %t = insertelement <4 x i32> %v, i32 %x, i32 0
  ret <4 x i32> %t
 }

@@ -563,12 +578,13 @@
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %q = ptrtoint i32* %p to i32
  %r = add nsw i32 %q, 24
  %s = inttoptr i32 %r to i32*
- %t = tail call <4 x i32> @llvm.wasm.load32.lane(i32* %s, <4 x i32> %v, i32 0)
+ %x = load i32, i32* %s
+ %t = insertelement <4 x i32> %v, i32 %x, i32 0
  ret <4 x i32> %t
 }

@@ -580,10 +596,11 @@
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr i32, i32* %p, i32 6
- %t = tail call <4 x i32> @llvm.wasm.load32.lane(i32* %s, <4 x i32> %v, i32 0)
+ %x = load i32, i32* %s
+ %t = insertelement <4 x i32> %v, i32 %x, i32 0
  ret <4 x i32> %t
 }

@@ -593,10 +610,11 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const 42
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = inttoptr i32 42 to i32*
- %t = tail call <4 x i32> @llvm.wasm.load32.lane(i32* %s, <4 x i32> %v, i32 0)
+ %x = load i32, i32* %s
+ %t = insertelement <4 x i32> %v, i32 %x, i32 0
  ret <4 x i32> %t
 }

@@ -607,9 +625,10 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const gv_i32
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
- %t = tail call <4 x i32> @llvm.wasm.load32.lane(i32* @gv_i32, <4 x i32> %v, i32 0)
+ %x = load i32, i32* @gv_i32
+ %t = insertelement <4 x i32> %v, i32 %x, i32 0
  ret <4 x i32> %t
 }

@@ -739,9 +758,10 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: local.get 0
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
- %t = tail call <2 x i64> @llvm.wasm.load64.lane(i64* %p, <2 x i64> %v, i32 0)
+ %x = load i64, i64* %p
+ %t = insertelement <2 x i64> %v, i64 %x, i32 0
  ret <2 x i64> %t
 }

@@ -753,12 +773,13 @@
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %q = ptrtoint i64* %p to i32
  %r = add nuw i32 %q, 24
  %s = inttoptr i32 %r to i64*
- %t = tail call <2 x i64> @llvm.wasm.load64.lane(i64* %s, <2 x i64> %v, i32 0)
+ %x = load i64, i64* %s
+ %t = insertelement <2 x i64> %v, i64 %x, i32 0
  ret <2 x i64> %t
 }

@@ -770,10 +791,11 @@
 ; CHECK-NEXT: i32.const 48
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr inbounds i64, i64* %p, i32 6
- %t = tail call <2 x i64> @llvm.wasm.load64.lane(i64* %s, <2 x i64> %v, i32 0)
+ %x = load i64, i64* %s
+ %t = insertelement <2 x i64> %v, i64 %x, i32 0
  ret <2 x i64> %t
 }

@@ -785,10 +807,11 @@
 ; CHECK-NEXT: i32.const -48
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr inbounds i64, i64* %p, i32 -6
- %t = tail call <2 x i64> @llvm.wasm.load64.lane(i64* %s, <2 x i64> %v, i32 0)
+ %x = load i64, i64* %s
+ %t = insertelement <2 x i64> %v, i64 %x, i32 0
  ret <2 x i64> %t
 }

@@ -800,12 +823,13 @@
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %q = ptrtoint i64* %p to i32
  %r = add nsw i32 %q, 24
  %s = inttoptr i32 %r to i64*
- %t = tail call <2 x i64> @llvm.wasm.load64.lane(i64* %s, <2 x i64> %v, i32 0)
+ %x = load i64, i64* %s
+ %t = insertelement <2 x i64> %v, i64 %x, i32 0
  ret <2 x i64> %t
 }

@@ -817,10 +841,11 @@
 ; CHECK-NEXT: i32.const 48
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 1
-; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = getelementptr i64, i64* %p, i32 6
- %t = tail call <2 x i64> @llvm.wasm.load64.lane(i64* %s, <2 x i64> %v, i32 0)
+ %x = load i64, i64* %s
+ %t = insertelement <2 x i64> %v, i64 %x, i32 0
  ret <2 x i64> %t
 }

@@ -830,10 +855,11 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const 42
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
  %s = inttoptr i32 42 to i64*
- %t = tail call <2 x i64> @llvm.wasm.load64.lane(i64* %s, <2 x i64> %v, i32 0)
+ %x = load i64, i64* %s
+ %t = insertelement <2 x i64> %v, i64 %x, i32 0
  ret <2 x i64> %t
 }

@@ -844,9 +870,10 @@
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const gv_i64
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.load64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
- %t = tail call <2 x i64> @llvm.wasm.load64.lane(i64* @gv_i64, <2 x i64> %v, i32 0)
+ %x = load i64, i64* @gv_i64
+ %t = insertelement <2 x i64> %v, i64 %x, i32 0
  ret <2 x i64> %t
 }

diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll b/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
@@ -133,6 +133,34 @@
  ret <16 x i8> %v2
 }

+; 1 is the default alignment for v128.load8_lane so no attribute is needed.
+define <16 x i8> @load_lane_i8_a1(i8* %p, <16 x i8> %v) {
+; CHECK-LABEL: load_lane_i8_a1:
+; CHECK: .functype load_lane_i8_a1 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load8_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i8, i8* %p, align 1
+ %v1 = insertelement <16 x i8> %v, i8 %e, i32 0
+ ret <16 x i8> %v1
+}
+
+; 2 is greater than the default alignment so it is ignored.
+define <16 x i8> @load_lane_i8_a2(i8* %p, <16 x i8> %v) {
+; CHECK-LABEL: load_lane_i8_a2:
+; CHECK: .functype load_lane_i8_a2 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load8_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i8, i8* %p, align 2
+ %v1 = insertelement <16 x i8> %v, i8 %e, i32 0
+ ret <16 x i8> %v1
+}
+
 ; ==============================================================================
 ; 8 x i16
 ; ==============================================================================

@@ -393,6 +421,47 @@
  ret <8 x i16> %v2
 }

+define <8 x i16> @load_lane_i16_a1(i16* %p, <8 x i16> %v) {
+; CHECK-LABEL: load_lane_i16_a1:
+; CHECK: .functype load_lane_i16_a1 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load16_lane 0:p2align=0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i16, i16* %p, align 1
+ %v1 = insertelement <8 x i16> %v, i16 %e, i32 0
+ ret <8 x i16> %v1
+}
+
+; 2 is the default alignment for v128.load16_lane so no attribute is needed.
+define <8 x i16> @load_lane_i16_a2(i16* %p, <8 x i16> %v) {
+; CHECK-LABEL: load_lane_i16_a2:
+; CHECK: .functype load_lane_i16_a2 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load16_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i16, i16* %p, align 2
+ %v1 = insertelement <8 x i16> %v, i16 %e, i32 0
+ ret <8 x i16> %v1
+}
+
+; 4 is greater than the default alignment so it is ignored.
+define <8 x i16> @load_lane_i16_a4(i16* %p, <8 x i16> %v) {
+; CHECK-LABEL: load_lane_i16_a4:
+; CHECK: .functype load_lane_i16_a4 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load16_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i16, i16* %p, align 4
+ %v1 = insertelement <8 x i16> %v, i16 %e, i32 0
+ ret <8 x i16> %v1
+}
+
 ; ==============================================================================
 ; 4 x i32
 ; ==============================================================================

@@ -666,6 +735,60 @@
  ret <4 x i32> %v2
 }

+define <4 x i32> @load_lane_i32_a1(i32* %p, <4 x i32> %v) {
+; CHECK-LABEL: load_lane_i32_a1:
+; CHECK: .functype load_lane_i32_a1 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load32_lane 0:p2align=0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i32, i32* %p, align 1
+ %v1 = insertelement <4 x i32> %v, i32 %e, i32 0
+ ret <4 x i32> %v1
+}
+
+define <4 x i32> @load_lane_i32_a2(i32* %p, <4 x i32> %v) {
+; CHECK-LABEL: load_lane_i32_a2:
+; CHECK: .functype load_lane_i32_a2 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load32_lane 0:p2align=1, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i32, i32* %p, align 2
+ %v1 = insertelement <4 x i32> %v, i32 %e, i32 0
+ ret <4 x i32> %v1
+}
+
+; 4 is the default alignment for v128.load32_lane so no attribute is needed.
+define <4 x i32> @load_lane_i32_a4(i32* %p, <4 x i32> %v) {
+; CHECK-LABEL: load_lane_i32_a4:
+; CHECK: .functype load_lane_i32_a4 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load32_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i32, i32* %p, align 4
+ %v1 = insertelement <4 x i32> %v, i32 %e, i32 0
+ ret <4 x i32> %v1
+}
+
+; 8 is greater than the default alignment so it is ignored.
+define <4 x i32> @load_lane_i32_a8(i32* %p, <4 x i32> %v) {
+; CHECK-LABEL: load_lane_i32_a8:
+; CHECK: .functype load_lane_i32_a8 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load32_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i32, i32* %p, align 8
+ %v1 = insertelement <4 x i32> %v, i32 %e, i32 0
+ ret <4 x i32> %v1
+}
+
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================

@@ -833,6 +956,73 @@
  ret <2 x i64> %v2
 }

+define <2 x i64> @load_lane_i64_a1(i64* %p, <2 x i64> %v) {
+; CHECK-LABEL: load_lane_i64_a1:
+; CHECK: .functype load_lane_i64_a1 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load64_lane 0:p2align=0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i64, i64* %p, align 1
+ %v1 = insertelement <2 x i64> %v, i64 %e, i32 0
+ ret <2 x i64> %v1
+}
+
+define <2 x i64> @load_lane_i64_a2(i64* %p, <2 x i64> %v) {
+; CHECK-LABEL: load_lane_i64_a2:
+; CHECK: .functype load_lane_i64_a2 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load64_lane 0:p2align=1, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i64, i64* %p, align 2
+ %v1 = insertelement <2 x i64> %v, i64 %e, i32 0
+ ret <2 x i64> %v1
+}
+
+define <2 x i64> @load_lane_i64_a4(i64* %p, <2 x i64> %v) {
+; CHECK-LABEL: load_lane_i64_a4:
+; CHECK: .functype load_lane_i64_a4 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load64_lane 0:p2align=2, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i64, i64* %p, align 4
+ %v1 = insertelement <2 x i64> %v, i64 %e, i32 0
+ ret <2 x i64> %v1
+}
+
+; 8 is the default alignment for v128.load64_lane so no attribute is needed.
+define <2 x i64> @load_lane_i64_a8(i64* %p, <2 x i64> %v) {
+; CHECK-LABEL: load_lane_i64_a8:
+; CHECK: .functype load_lane_i64_a8 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load64_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i64, i64* %p, align 8
+ %v1 = insertelement <2 x i64> %v, i64 %e, i32 0
+ ret <2 x i64> %v1
+}
+
+; 16 is greater than the default alignment so it is ignored.
+define <2 x i64> @load_lane_i64_a16(i64* %p, <2 x i64> %v) {
+; CHECK-LABEL: load_lane_i64_a16:
+; CHECK: .functype load_lane_i64_a16 (i32, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load64_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+ %e = load i64, i64* %p, align 16
+ %v1 = insertelement <2 x i64> %v, i64 %e, i32 0
+ ret <2 x i64> %v1
+}
+
 ; ==============================================================================
 ; 4 x float
 ; ==============================================================================