diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -191,7 +191,6 @@
 TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4, "V4iV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4, "V4UiV2d", "nc", "simd128")
-TARGET_BUILTIN(__builtin_wasm_demote_zero_f64x2_f32x4, "V4fV2d", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_load32_zero, "V4iiC*", "n", "simd128")
 TARGET_BUILTIN(__builtin_wasm_load64_zero, "V2LLiLLiC*", "n", "simd128")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17723,11 +17723,6 @@
                                    Builder.getInt32(2), Builder.getInt32(3)});
     return Builder.CreateShuffleVector(Trunc, Splat, ConcatMask);
   }
-  case WebAssembly::BI__builtin_wasm_demote_zero_f64x2_f32x4: {
-    Value *Vec = EmitScalarExpr(E->getArg(0));
-    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_demote_zero);
-    return Builder.CreateCall(Callee, Vec);
-  }
   case WebAssembly::BI__builtin_wasm_load32_zero: {
     Value *Ptr = EmitScalarExpr(E->getArg(0));
     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load32_zero);
diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h
--- a/clang/lib/Headers/wasm_simd128.h
+++ b/clang/lib/Headers/wasm_simd128.h
@@ -1151,7 +1151,9 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_f32x4_demote_f64x2_zero(v128_t __a) {
-  return (v128_t)__builtin_wasm_demote_zero_f64x2_f32x4((__f64x2)__a);
+  return (v128_t) __builtin_convertvector(
+      __builtin_shufflevector((__f64x2)__a, (__f64x2){0, 0}, 0, 1, 2, 3),
+      __f32x4);
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -892,12 +892,6 @@
   // WEBASSEMBLY: ret <4 x i32> %1
 }
 
-f32x4 wasm_demote_zero_f64x2_f32x4(f64x2 x) {
-  return __builtin_wasm_demote_zero_f64x2_f32x4(x);
-  // WEBASSEMBLY: call <4 x float> @llvm.wasm.demote.zero(<2 x double> %x)
-  // WEBASSEMBLY: ret
-}
-
 i32x4 load32_zero(const int *p) {
   return __builtin_wasm_load32_zero(p);
   // WEBASSEMBLY: call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c
--- a/clang/test/Headers/wasm.c
+++ b/clang/test/Headers/wasm.c
@@ -2465,9 +2465,10 @@
 // CHECK-LABEL: @test_f32x4_demote_f64x2_zero(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.wasm.demote.zero(<2 x double> [[TMP0]]) #[[ATTR10]]
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[CONV_I:%.*]] = fptrunc <4 x double> [[SHUFFLE_I]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[CONV_I]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 //
 v128_t test_f32x4_demote_f64x2_zero(v128_t a) {
   return wasm_f32x4_demote_f64x2_zero(a);
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -263,11 +263,6 @@
             [LLVMSubdivide2VectorType<0>],
             [IntrNoMem, IntrSpeculatable]>;
 
-// TODO: Remove this if possible.
-def int_wasm_demote_zero :
-  Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty],
-            [IntrNoMem, IntrSpeculatable]>;
-
 //===----------------------------------------------------------------------===//
 // Thread-local storage intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -40,6 +40,7 @@
 HANDLE_NODETYPE(PROMOTE_LOW)
 HANDLE_NODETYPE(TRUNC_SAT_ZERO_S)
 HANDLE_NODETYPE(TRUNC_SAT_ZERO_U)
+HANDLE_NODETYPE(DEMOTE_ZERO)
 HANDLE_NODETYPE(THROW)
 HANDLE_NODETYPE(CATCH)
 HANDLE_NODETYPE(MEMORY_COPY)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -156,7 +156,11 @@
     setTargetDAGCombine(ISD::FP_EXTEND);
     setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
 
-    // Combine concat of {s,u}int_to_fp_sat to i32x4.trunc_sat_f64x2_zero_{s,u}
+    // Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa
+    // into conversion ops
+    setTargetDAGCombine(ISD::FP_TO_SINT_SAT);
+    setTargetDAGCombine(ISD::FP_TO_UINT_SAT);
+    setTargetDAGCombine(ISD::FP_ROUND);
     setTargetDAGCombine(ISD::CONCAT_VECTORS);
 
     // Support saturating add for i8x16 and i16x8
@@ -2294,45 +2298,121 @@
 }
 
 static SDValue
-performVectorTruncSatLowCombine(SDNode *N,
-                                TargetLowering::DAGCombinerInfo &DCI) {
+performVectorTruncZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   auto &DAG = DCI.DAG;
-  assert(N->getOpcode() == ISD::CONCAT_VECTORS);
+
+  auto GetWasmConversionOp = [](unsigned Op) {
+    switch (Op) {
+    case ISD::FP_TO_SINT_SAT:
+      return WebAssemblyISD::TRUNC_SAT_ZERO_S;
+    case ISD::FP_TO_UINT_SAT:
+      return WebAssemblyISD::TRUNC_SAT_ZERO_U;
+    case ISD::FP_ROUND:
+      return WebAssemblyISD::DEMOTE_ZERO;
+    }
+    llvm_unreachable("unexpected op");
+  };
+
+  auto IsZeroSplat = [](SDValue SplatVal) {
+    auto *Splat = dyn_cast<BuildVectorSDNode>(SplatVal.getNode());
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    return Splat &&
+           Splat->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                                  HasAnyUndefs) &&
+           SplatValue == 0;
+  };
+
+  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
+    // Combine this:
+    //
+    //   (concat_vectors (v2i32 (fp_to_{s,u}int_sat $x, 32)), (v2i32 (splat 0)))
+    //
+    // into (i32x4.trunc_sat_f64x2_zero_{s,u} $x).
+    //
+    // Or this:
+    //
+    //   (concat_vectors (v2f32 (fp_round (v2f64 $x))), (v2f32 (splat 0)))
+    //
+    // into (f32x4.demote_zero_f64x2 $x).
+    EVT ResVT;
+    EVT ExpectedConversionType;
+    auto Conversion = N->getOperand(0);
+    auto ConversionOp = Conversion.getOpcode();
+    switch (ConversionOp) {
+    case ISD::FP_TO_SINT_SAT:
+    case ISD::FP_TO_UINT_SAT:
+      ResVT = MVT::v4i32;
+      ExpectedConversionType = MVT::v2i32;
+      break;
+    case ISD::FP_ROUND:
+      ResVT = MVT::v4f32;
+      ExpectedConversionType = MVT::v2f32;
+      break;
+    default:
+      return SDValue();
+    }
+
+    if (N->getValueType(0) != ResVT)
+      return SDValue();
+
+    if (Conversion.getValueType() != ExpectedConversionType)
+      return SDValue();
+
+    auto Source = Conversion.getOperand(0);
+    if (Source.getValueType() != MVT::v2f64)
+      return SDValue();
+
+    if (!IsZeroSplat(N->getOperand(1)) ||
+        N->getOperand(1).getValueType() != ExpectedConversionType)
+      return SDValue();
+
+    unsigned Op = GetWasmConversionOp(ConversionOp);
+    return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+  }
 
   // Combine this:
   //
-  //   (concat_vectors (v2i32 (fp_to_{s,u}int_sat $x, 32)), (v2i32 (splat 0)))
+  //   (fp_to_{s,u}int_sat (concat_vectors $x, (v2f64 (splat 0))), 32)
   //
   // into (i32x4.trunc_sat_f64x2_zero_{s,u} $x).
-  EVT ResVT = N->getValueType(0);
-  if (ResVT != MVT::v4i32)
-    return SDValue();
+  //
+  // Or this:
+  //
+  //   (v4f32 (fp_round (concat_vectors $x, (v2f64 (splat 0)))))
+  //
+  // into (f32x4.demote_zero_f64x2 $x).
+  EVT ResVT;
+  auto ConversionOp = N->getOpcode();
+  switch (ConversionOp) {
+  case ISD::FP_TO_SINT_SAT:
+  case ISD::FP_TO_UINT_SAT:
+    ResVT = MVT::v4i32;
+    break;
+  case ISD::FP_ROUND:
+    ResVT = MVT::v4f32;
+    break;
+  default:
+    llvm_unreachable("unexpected op");
+  }
 
-  auto FPToInt = N->getOperand(0);
-  auto FPToIntOp = FPToInt.getOpcode();
-  if (FPToIntOp != ISD::FP_TO_SINT_SAT && FPToIntOp != ISD::FP_TO_UINT_SAT)
+  if (N->getValueType(0) != ResVT)
     return SDValue();
-  if (cast<VTSDNode>(FPToInt.getOperand(1))->getVT() != MVT::i32)
+
+  auto Concat = N->getOperand(0);
+  if (Concat.getValueType() != MVT::v4f64)
     return SDValue();
 
-  auto Source = FPToInt.getOperand(0);
+  auto Source = Concat.getOperand(0);
   if (Source.getValueType() != MVT::v2f64)
     return SDValue();
 
-  auto *Splat = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
-  APInt SplatValue, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (!Splat || !Splat->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
-                                        HasAnyUndefs))
+  if (!IsZeroSplat(Concat.getOperand(1)) ||
+      Concat.getOperand(1).getValueType() != MVT::v2f64)
     return SDValue();
-  if (SplatValue != 0)
-    return SDValue();
-
-  unsigned Op = FPToIntOp == ISD::FP_TO_SINT_SAT
-                    ? WebAssemblyISD::TRUNC_SAT_ZERO_S
-                    : WebAssemblyISD::TRUNC_SAT_ZERO_U;
+  unsigned Op = GetWasmConversionOp(ConversionOp);
   return DAG.getNode(Op, SDLoc(N), ResVT, Source);
 }
 
@@ -2352,7 +2432,10 @@
   case ISD::FP_EXTEND:
   case ISD::EXTRACT_SUBVECTOR:
     return performVectorConvertLowCombine(N, DCI);
+  case ISD::FP_TO_SINT_SAT:
+  case ISD::FP_TO_UINT_SAT:
+  case ISD::FP_ROUND:
   case ISD::CONCAT_VECTORS:
-    return performVectorTruncSatLowCombine(N, DCI);
+    return performVectorTruncZeroCombine(N, DCI);
   }
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1289,7 +1289,9 @@
                       "extadd_pairwise_i16x8_u", 0x7f>;
 
 // f64x2 <-> f32x4 conversions
-defm "" : SIMDConvert<F32x4, F64x2, int_wasm_demote_zero,
-                      "demote_zero_f64x2", 0x5e>;
+def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+def demote_zero : SDNode<"WebAssemblyISD::DEMOTE_ZERO", demote_t>;
+defm "" : SIMDConvert<F32x4, F64x2, demote_zero,
+                      "demote_zero_f64x2", 0x5e>;
 
 def promote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
@@ -82,6 +82,30 @@
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: demote_zero_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .functype demote_zero_v4f32 (v128) -> (v128){{$}}
+; SIMD128-NEXT: f32x4.demote_zero_f64x2 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+define <4 x float> @demote_zero_v4f32(<2 x double> %x) {
+  %v = shufflevector <2 x double> %x, <2 x double> zeroinitializer,
+                     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = fptrunc <4 x double> %v to <4 x float>
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: demote_zero_v4f32_2:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .functype demote_zero_v4f32_2 (v128) -> (v128){{$}}
+; SIMD128-NEXT: f32x4.demote_zero_f64x2 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+define <4 x float> @demote_zero_v4f32_2(<2 x double> %x) {
+  %v = fptrunc <2 x double> %x to <2 x float>
+  %a = shufflevector <2 x float> %v, <2 x float> zeroinitializer,
+                     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %a
+}
+
 ; CHECK-LABEL: convert_low_s_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-NEXT: .functype convert_low_s_v2f64 (v128) -> (v128){{$}}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefixes=CHECK,SLOW
 ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -fast-isel | FileCheck %s
 
 ; Test that SIMD128 intrinsics lower as expected. These intrinsics are
@@ -542,6 +542,18 @@
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: trunc_sat_zero_s_v4i32_2:
+; CHECK-NEXT: .functype trunc_sat_zero_s_v4i32_2 (v128) -> (v128){{$}}
+; SLOW-NEXT: i32x4.trunc_sat_zero_f64x2_s $push[[R:[0-9]+]]=, $0{{$}}
+; SLOW-NEXT: return $pop[[R]]{{$}}
+declare <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double>)
+define <4 x i32> @trunc_sat_zero_s_v4i32_2(<2 x double> %x) {
+  %v = shufflevector <2 x double> %x, <2 x double> zeroinitializer,
+                     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> %v)
+  ret <4 x i32> %a
+}
+
 ; CHECK-LABEL: trunc_sat_zero_u_v4i32:
 ; CHECK-NEXT: .functype trunc_sat_zero_u_v4i32 (v128) -> (v128){{$}}
 ; CHECK-NEXT: i32x4.trunc_sat_zero_f64x2_u $push[[R:[0-9]+]]=, $0{{$}}
@@ -554,6 +566,18 @@
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: trunc_sat_zero_u_v4i32_2:
+; CHECK-NEXT: .functype trunc_sat_zero_u_v4i32_2 (v128) -> (v128){{$}}
+; SLOW-NEXT: i32x4.trunc_sat_zero_f64x2_u $push[[R:[0-9]+]]=, $0{{$}}
+; SLOW-NEXT: return $pop[[R]]{{$}}
+declare <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double>)
+define <4 x i32> @trunc_sat_zero_u_v4i32_2(<2 x double> %x) {
+  %v = shufflevector <2 x double> %x, <2 x double> zeroinitializer,
+                     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> %v)
+  ret <4 x i32> %a
+}
+
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
@@ -722,16 +746,6 @@
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: demote_zero_v4f32:
-; CHECK-NEXT: .functype demote_zero_v4f32 (v128) -> (v128){{$}}
-; CHECK-NEXT: f32x4.demote_zero_f64x2 $push[[R:[0-9]+]]=, $0{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-declare <4 x float> @llvm.wasm.demote.zero(<2 x double>)
-define <4 x float> @demote_zero_v4f32(<2 x double> %a) {
-  %v = call <4 x float> @llvm.wasm.demote.zero(<2 x double> %a)
-  ret <4 x float> %v
-}
-
 ; ==============================================================================
 ; 2 x f64
 ; ==============================================================================
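Note (not part of the patch): a minimal usage sketch of what this change means for header users, assuming clang is invoked along the lines of `clang --target=wasm32 -msimd128 -O2`. The function names `demote_low` and `demote_low_open_coded` below are hypothetical. With the wasm-specific builtin and intrinsic removed, wasm_f32x4_demote_f64x2_zero expands to the target-independent shuffle/convert builtins, and the new FP_ROUND/CONCAT_VECTORS combines are what still select a single f32x4.demote_zero_f64x2 for equivalent open-coded patterns.

#include <wasm_simd128.h>

// Uses the header intrinsic, which after this patch expands to
// __builtin_shufflevector + __builtin_convertvector rather than
// @llvm.wasm.demote.zero.
v128_t demote_low(v128_t x) { return wasm_f32x4_demote_f64x2_zero(x); }

// Hand-written equivalent of the same pattern; the new DAG combine on
// fp_round of a concat-with-zero (or the reverse order) is what lets this
// select the same single instruction.
typedef double f64x2_vec __attribute__((vector_size(16)));
typedef float f32x4_vec __attribute__((vector_size(16)));

f32x4_vec demote_low_open_coded(f64x2_vec x) {
  f64x2_vec zero = {0.0, 0.0};
  // Widen to four lanes (low two from x, high two zero), then truncate to f32.
  return __builtin_convertvector(
      __builtin_shufflevector(x, zero, 0, 1, 2, 3), f32x4_vec);
}

The apparent motivation (consistent with the removed "TODO: Remove this if possible" comment and with the earlier trunc_sat_zero change) is that representing the operation with generic ISD nodes lets target-independent optimizations see through it instead of treating @llvm.wasm.demote.zero as opaque.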