diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -303,6 +303,7 @@
       setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal);
       setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal);
     }
+    setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Legal);
   }
 
   // Don't do anything clever with build_pairs
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1288,6 +1288,19 @@
 def promote_low : SDNode<"WebAssemblyISD::PROMOTE_LOW", promote_t>;
 defm "" : SIMDConvert<F64x2, F32x4, promote_low, "promote_low_f32x4", 0x5f>;
 
+// Lower extending loads to load64_zero + promote_low
+def extloadv2f32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+  let MemoryVT = v2f32;
+}
+// Adapted from the body of LoadPatNoOffset
+// TODO: other addressing patterns
+def : Pat<(v2f64 (extloadv2f32 (i32 I32:$addr))),
+          (promote_low_F64x2 (LOAD_ZERO_I64x2_A32 0, 0, I32:$addr))>,
+      Requires<[HasAddr32]>;
+def : Pat<(v2f64 (extloadv2f32 (i64 I64:$addr))),
+          (promote_low_F64x2 (LOAD_ZERO_I64x2_A64 0, 0, I64:$addr))>,
+      Requires<[HasAddr64]>;
+
 //===----------------------------------------------------------------------===//
 // Saturating Rounding Q-Format Multiplication
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-promote-wide.ll b/llvm/test/CodeGen/WebAssembly/simd-load-promote-wide.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-load-promote-wide.ll
@@ -0,0 +1,203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test wide load+promote patterns, which after combines and legalization are
+; represented differently than 128-bit load+promote patterns.
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x double> @load_promote_v2f64(<4 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64:
+; CHECK:         .functype load_promote_v2f64 (i32, i32) -> ()
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %e = load <4 x float>, <4 x float>* %p
+  %v = fpext <4 x float> %e to <4 x double>
+  ret <4 x double> %v
+}
+
+define <4 x double> @load_promote_v2f64_with_folded_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_folded_offset:
+; CHECK:         .functype load_promote_v2f64_with_folded_offset (i32, i32) -> ()
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <4 x float>* %p to i32
+  %r = add nuw i32 %q, 16
+  %s = inttoptr i32 %r to <4 x float>*
+  %e = load <4 x float>, <4 x float>* %s
+  %v = fpext <4 x float> %e to <4 x double>
+  ret <4 x double> %v
+}
+
+define <4 x double> @load_promote_v2f64_with_folded_gep_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_folded_gep_offset:
+; CHECK:         .functype load_promote_v2f64_with_folded_gep_offset (i32, i32) -> ()
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 1
+  %e = load <4 x float>, <4 x float>* %s
+  %v = fpext <4 x float> %e to <4 x double>
+  ret <4 x double> %v
+}
+
+define <4 x double> @load_promote_v2f64_with_unfolded_gep_negative_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_promote_v2f64_with_unfolded_gep_negative_offset (i32, i32) -> ()
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 1
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
+  %e = load <4 x float>, <4 x float>* %s
+  %v = fpext <4 x float> %e to <4 x double>
+  ret <4 x double> %v
+}
+
+define <4 x double> @load_promote_v2f64_with_unfolded_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_unfolded_offset:
+; CHECK:         .functype load_promote_v2f64_with_unfolded_offset (i32, i32) -> ()
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <4 x float>* %p to i32
+  %r = add nsw i32 %q, 16
+  %s = inttoptr i32 %r to <4 x float>*
+  %e = load <4 x float>, <4 x float>* %s
+  %v = fpext <4 x float> %e to <4 x double>
+  ret <4 x double> %v
+}
+
+define <4 x double> @load_promote_v2f64_with_unfolded_gep_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_unfolded_gep_offset:
+; CHECK:         .functype load_promote_v2f64_with_unfolded_gep_offset (i32, i32) -> ()
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr <4 x float>, <4 x float>* %p, i32 1
+  %e = load <4 x float>, <4 x float>* %s
+  %v = fpext <4 x float> %e to <4 x double>
+  ret <4 x double> %v
+}
+
+define <4 x double> @load_promote_v2f64_from_numeric_address() {
+; CHECK-LABEL: load_promote_v2f64_from_numeric_address:
+; CHECK:         .functype load_promote_v2f64_from_numeric_address (i32) -> ()
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 40
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 32
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = inttoptr i32 32 to <4 x float>*
+  %e = load <4 x float>, <4 x float>* %s
+  %v = fpext <4 x float> %e to <4 x double>
+  ret <4 x double> %v
+}
+
+@gv_v4f32 = global <4 x float> <float 42., float 42., float 42., float 42.>
+define <4 x double> @load_promote_v2f64_from_global_address() {
+; CHECK-LABEL: load_promote_v2f64_from_global_address:
+; CHECK:         .functype load_promote_v2f64_from_global_address (i32) -> ()
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const gv_v4f32
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const gv_v4f32
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %e = load <4 x float>, <4 x float>* @gv_v4f32
+  %v = fpext <4 x float> %e to <4 x double>
+  ret <4 x double> %v
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
@@ -2957,6 +2957,19 @@
   ret <2 x double> %v2
 }
 
+define <2 x double> @load_promote_v2f64(<2 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64:
+; CHECK:         .functype load_promote_v2f64 (i32) -> (v128)
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    # fallthrough-return
+  %e = load <2 x float>, <2 x float>* %p
+  %v = fpext <2 x float> %e to <2 x double>
+  ret <2 x double> %v
+}
+
 define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_folded_offset:
 ; CHECK:         .functype load_v2f64_with_folded_offset (i32) -> (v128)
@@ -2987,6 +3000,24 @@
   ret <2 x double> %v2
 }
 
+define <2 x double> @load_promote_v2f64_with_folded_offset(<2 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_folded_offset:
+; CHECK:         .functype load_promote_v2f64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <2 x float>* %p to i32
+  %r = add nuw i32 %q, 16
+  %s = inttoptr i32 %r to <2 x float>*
+  %e = load <2 x float>, <2 x float>* %s
+  %v = fpext <2 x float> %e to <2 x double>
+  ret <2 x double> %v
+}
+
 define <2 x double> @load_v2f64_with_folded_gep_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_folded_gep_offset:
 ; CHECK:         .functype load_v2f64_with_folded_gep_offset (i32) -> (v128)
@@ -3013,6 +3044,22 @@
   ret <2 x double> %v2
 }
 
+define <2 x double> @load_promote_v2f64_with_folded_gep_offset(<2 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_folded_gep_offset:
+; CHECK:         .functype load_promote_v2f64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <2 x float>, <2 x float>* %p, i32 1
+  %e = load <2 x float>, <2 x float>* %s
+  %v = fpext <2 x float> %e to <2 x double>
+  ret <2 x double> %v
+}
+
 define <2 x double> @load_v2f64_with_unfolded_gep_negative_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_unfolded_gep_negative_offset:
 ; CHECK:         .functype load_v2f64_with_unfolded_gep_negative_offset (i32) -> (v128)
@@ -3043,6 +3090,22 @@
   ret <2 x double> %v2
 }
 
+define <2 x double> @load_promote_v2f64_with_unfolded_gep_negative_offset(<2 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_promote_v2f64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <2 x float>, <2 x float>* %p, i32 -1
+  %e = load <2 x float>, <2 x float>* %s
+  %v = fpext <2 x float> %e to <2 x double>
+  ret <2 x double> %v
+}
+
 define <2 x double> @load_v2f64_with_unfolded_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_unfolded_offset:
 ; CHECK:         .functype load_v2f64_with_unfolded_offset (i32) -> (v128)
@@ -3077,6 +3140,24 @@
   ret <2 x double> %v2
 }
 
+define <2 x double> @load_promote_v2f64_with_unfolded_offset(<2 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_unfolded_offset:
+; CHECK:         .functype load_promote_v2f64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <2 x float>* %p to i32
+  %r = add nsw i32 %q, 16
+  %s = inttoptr i32 %r to <2 x float>*
+  %e = load <2 x float>, <2 x float>* %s
+  %v = fpext <2 x float> %e to <2 x double>
+  ret <2 x double> %v
+}
+
 define <2 x double> @load_v2f64_with_unfolded_gep_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_unfolded_gep_offset:
 ; CHECK:         .functype load_v2f64_with_unfolded_gep_offset (i32) -> (v128)
@@ -3107,6 +3188,22 @@
   ret <2 x double> %v2
 }
 
+define <2 x double> @load_promote_v2f64_with_unfolded_gep_offset(<2 x float>* %p) {
+; CHECK-LABEL: load_promote_v2f64_with_unfolded_gep_offset:
+; CHECK:         .functype load_promote_v2f64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr <2 x float>, <2 x float>* %p, i32 1
+  %e = load <2 x float>, <2 x float>* %s
+  %v = fpext <2 x float> %e to <2 x double>
+  ret <2 x double> %v
+}
+
 define <2 x double> @load_v2f64_from_numeric_address() {
 ; CHECK-LABEL: load_v2f64_from_numeric_address:
 ; CHECK:         .functype load_v2f64_from_numeric_address () -> (v128)
@@ -3133,6 +3230,20 @@
   ret <2 x double> %v2
 }
 
+define <2 x double> @load_promote_v2f64_from_numeric_address() {
+; CHECK-LABEL: load_promote_v2f64_from_numeric_address:
+; CHECK:         .functype load_promote_v2f64_from_numeric_address () -> (v128)
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    i32.const 32
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    # fallthrough-return
+  %s = inttoptr i32 32 to <2 x float>*
+  %e = load <2 x float>, <2 x float>* %s
+  %v = fpext <2 x float> %e to <2 x double>
+  ret <2 x double> %v
+}
+
 @gv_v2f64 = global <2 x double> <double 42., double 42.>
 define <2 x double> @load_v2f64_from_global_address() {
 ; CHECK-LABEL: load_v2f64_from_global_address:
@@ -3159,6 +3270,20 @@
   ret <2 x double> %v2
 }
 
+@gv_v2f32 = global <2 x float> <float 42., float 42.>
+define <2 x double> @load_promote_v2f64_from_global_address() {
+; CHECK-LABEL: load_promote_v2f64_from_global_address:
+; CHECK:         .functype load_promote_v2f64_from_global_address () -> (v128)
+; CHECK-NEXT:    # %bb.0:
+; CHECK-NEXT:    i32.const gv_v2f32
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    f64x2.promote_low_f32x4
+; CHECK-NEXT:    # fallthrough-return
+  %e = load <2 x float>, <2 x float>* @gv_v2f32
+  %v = fpext <2 x float> %e to <2 x double>
+  ret <2 x double> %v
+}
+
 define void @store_v2f64(<2 x double> %v, <2 x double>* %p) {
 ; CHECK-LABEL: store_v2f64:
 ; CHECK:         .functype store_v2f64 (v128, i32) -> ()
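
Note: as a quick illustration of the instruction pair these patterns select (not part of the patch itself), the C sketch below writes the same v128.load64_zero + f64x2.promote_low_f32x4 sequence directly against clang's wasm_simd128.h intrinsics. The function name and build line are illustrative assumptions, e.g. clang --target=wasm32 -msimd128 -O2.

#include <wasm_simd128.h>

// Hand-written equivalent of what the new patterns now select for a
// v2f32 extending load: load two f32 lanes (zeroing the upper lanes),
// then widen the two low lanes to f64.
v128_t promote_low_pair(const float *p) {
  v128_t low = wasm_v128_load64_zero(p);     // v128.load64_zero
  return wasm_f64x2_promote_low_f32x4(low);  // f64x2.promote_low_f32x4
}

Using load64_zero rather than a full v128.load keeps the access within the 8 bytes the promote actually consumes, which is why the patterns pair it with promote_low instead of loading a whole 128-bit vector.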