diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -129,8 +129,12 @@ TARGET_BUILTIN(__builtin_wasm_min_f32x4, "V4fV4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f32x4, "V4fV4fV4f", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmin_f32x4, "V4fV4fV4f", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmax_f32x4, "V4fV4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17799,6 +17799,22 @@ CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmin_f32x4: + case WebAssembly::BI__builtin_wasm_pmin_f64x2: { + Value *LHS = EmitScalarExpr(E->getArg(0)); + Value *RHS = EmitScalarExpr(E->getArg(1)); + Function *Callee = + CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType())); + return Builder.CreateCall(Callee, {LHS, RHS}); + } + case WebAssembly::BI__builtin_wasm_pmax_f32x4: + case WebAssembly::BI__builtin_wasm_pmax_f64x2: { + Value *LHS = EmitScalarExpr(E->getArg(0)); + Value *RHS = EmitScalarExpr(E->getArg(1)); + Function *Callee = + CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType())); + return Builder.CreateCall(Callee, {LHS, RHS}); + } case WebAssembly::BI__builtin_wasm_ceil_f32x4: case WebAssembly::BI__builtin_wasm_floor_f32x4: case WebAssembly::BI__builtin_wasm_trunc_f32x4: diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -1150,14 +1150,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmin(v128_t __a, v128_t __b) { - __i32x4 __mask = (__i32x4)((__f32x4)__b < (__f32x4)__a); - return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmin_f32x4((__f32x4)__a, (__f32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmax(v128_t __a, v128_t __b) { - __i32x4 __mask = (__i32x4)((__f32x4)__a < (__f32x4)__b); - return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmax_f32x4((__f32x4)__a, (__f32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_abs(v128_t __a) { @@ -1220,14 +1218,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmin(v128_t __a, v128_t __b) { - __i64x2 __mask = (__i64x2)((__f64x2)__b < (__f64x2)__a); - return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmin_f64x2((__f64x2)__a, (__f64x2)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmax(v128_t __a, v128_t __b) { - __i64x2 __mask = (__i64x2)((__f64x2)__a < (__f64x2)__b); - return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmax_f64x2((__f64x2)__a, (__f64x2)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -506,6 +506,20 @@ // WEBASSEMBLY-NEXT: ret } +f32x4 pmin_f32x4(f32x4 x, f32x4 y) { + return __builtin_wasm_pmin_f32x4(x, y); + // WEBASSEMBLY: call <4 x float> @llvm.wasm.pmin.v4f32( + // WEBASSEMBLY-SAME: <4 x float> %x, <4 x float> %y) + // WEBASSEMBLY-NEXT: ret +} + +f32x4 pmax_f32x4(f32x4 x, f32x4 y) { + return __builtin_wasm_pmax_f32x4(x, y); + // WEBASSEMBLY: call <4 x float> @llvm.wasm.pmax.v4f32( + // WEBASSEMBLY-SAME: <4 x float> %x, <4 x float> %y) + // WEBASSEMBLY-NEXT: ret +} + f64x2 min_f64x2(f64x2 x, f64x2 y) { return __builtin_wasm_min_f64x2(x, y); // WEBASSEMBLY: call <2 x double> @llvm.minimum.v2f64( @@ -520,6 +534,20 @@ // WEBASSEMBLY-NEXT: ret } +f64x2 pmin_f64x2(f64x2 x, f64x2 y) { + return __builtin_wasm_pmin_f64x2(x, y); + // WEBASSEMBLY: call <2 x double> @llvm.wasm.pmin.v2f64( + // WEBASSEMBLY-SAME: <2 x double> %x, <2 x double> %y) + // WEBASSEMBLY-NEXT: ret +} + +f64x2 pmax_f64x2(f64x2 x, f64x2 y) { + return __builtin_wasm_pmax_f64x2(x, y); + // WEBASSEMBLY: call <2 x double> @llvm.wasm.pmax.v2f64( + // WEBASSEMBLY-SAME: <2 x double> %x, <2 x double> %y) + // WEBASSEMBLY-NEXT: ret +} + f32x4 ceil_f32x4(f32x4 x) { return __builtin_wasm_ceil_f32x4(x); // WEBASSEMBLY: call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c --- a/clang/test/Headers/wasm.c +++ b/clang/test/Headers/wasm.c @@ -2191,11 +2191,11 @@ // CHECK-LABEL: @test_f32x4_pmin( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP_I]], <4 x i32> [[B]], <4 x i32> [[A]] -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] // v128_t test_f32x4_pmin(v128_t a, v128_t b) { return wasm_f32x4_pmin(a, b); @@ -2205,9 +2205,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP_I]], <4 x i32> [[B]], <4 x i32> [[A]] -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] // v128_t test_f32x4_pmax(v128_t a, v128_t b) { return wasm_f32x4_pmax(a, b); @@ -2364,10 +2364,9 @@ // CHECK-LABEL: @test_f64x2_pmin( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[CMP_I]], <2 x double> [[TMP0]], <2 x double> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2379,8 +2378,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[CMP_I]], <2 x double> [[TMP1]], <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -164,6 +164,15 @@ [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, IntrSpeculatable]>; +def int_wasm_pmin : + Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_pmax : + Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; + def int_wasm_extadd_pairwise_signed : Intrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>], diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1165,6 +1165,16 @@ (pmax $lhs, $rhs)>; } +// And match the pmin/pmax LLVM intrinsics as well +def : Pat<(v4f32 (int_wasm_pmin (v4f32 V128:$lhs), (v4f32 V128:$rhs))), + (PMIN_F32x4 V128:$lhs, V128:$rhs)>; +def : Pat<(v4f32 (int_wasm_pmax (v4f32 V128:$lhs), (v4f32 V128:$rhs))), + (PMAX_F32x4 V128:$lhs, V128:$rhs)>; +def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))), + (PMIN_F64x2 V128:$lhs, V128:$rhs)>; +def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))), + (PMAX_F64x2 V128:$lhs, V128:$rhs)>; + //===----------------------------------------------------------------------===// // Conversions //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll --- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -540,6 +540,26 @@ ret <4 x float> %a } +; CHECK-LABEL: pmin_v4f32: +; CHECK-NEXT: .functype pmin_v4f32 (v128, v128) -> (v128){{$}} +; CHECK-NEXT: f32x4.pmin $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} +declare <4 x float> @llvm.wasm.pmin.v4f32(<4 x float>, <4 x float>) +define <4 x float> @pmin_v4f32(<4 x float> %a, <4 x float> %b) { + %v = call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %v +} + +; CHECK-LABEL: pmax_v4f32: +; CHECK-NEXT: .functype pmax_v4f32 (v128, v128) -> (v128){{$}} +; CHECK-NEXT: f32x4.pmax $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} +declare <4 x float> @llvm.wasm.pmax.v4f32(<4 x float>, <4 x float>) +define <4 x float> @pmax_v4f32(<4 x float> %a, <4 x float> %b) { + %v = call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %v +} + ; CHECK-LABEL: ceil_v4f32: ; CHECK-NEXT: .functype ceil_v4f32 (v128) -> (v128){{$}} ; CHECK-NEXT: f32x4.ceil $push[[R:[0-9]+]]=, $0{{$}} @@ -595,6 +615,26 @@ ret <2 x double> %a } +; CHECK-LABEL: pmin_v2f64: +; CHECK-NEXT: .functype pmin_v2f64 (v128, v128) -> (v128){{$}} +; CHECK-NEXT: f64x2.pmin $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} +declare <2 x double> @llvm.wasm.pmin.v2f64(<2 x double>, <2 x double>) +define <2 x double> @pmin_v2f64(<2 x double> %a, <2 x double> %b) { + %v = call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %v +} + +; CHECK-LABEL: pmax_v2f64: +; CHECK-NEXT: .functype pmax_v2f64 (v128, v128) -> (v128){{$}} +; CHECK-NEXT: f64x2.pmax $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} +declare <2 x double> @llvm.wasm.pmax.v2f64(<2 x double>, <2 x double>) +define <2 x double> @pmax_v2f64(<2 x double> %a, <2 x double> %b) { + %v = call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %v +} + ; CHECK-LABEL: ceil_v2f64: ; CHECK-NEXT: .functype ceil_v2f64 (v128) -> (v128){{$}} ; CHECK-NEXT: f64x2.ceil $push[[R:[0-9]+]]=, $0{{$}}