diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -285,14 +285,16 @@ defm "" : SIMDLoadZero; // Use load_zero to load scalars into vectors as well where possible. -// TODO: i32, i16, and i8 scalars -def load_scalar : - PatFrag<(ops node:$addr), (scalar_to_vector (i64 (load $addr)))>; -defm : LoadPatNoOffset<v2i64, load_scalar, "LOAD_ZERO_I64x2">; -defm : LoadPatImmOff<v2i64, load_scalar, regPlusImm, "LOAD_ZERO_I64x2">; -defm : LoadPatImmOff<v2i64, load_scalar, or_is_add, "LOAD_ZERO_I64x2">; -defm : LoadPatOffsetOnly<v2i64, load_scalar, "LOAD_ZERO_I64x2">; -defm : LoadPatGlobalAddrOffOnly<v2i64, load_scalar, "LOAD_ZERO_I64x2">; +// TODO: i16 and i8 scalars +foreach vec = [I32x4, I64x2] in { + defvar inst = "LOAD_ZERO_"#vec; + defvar pat = PatFrag<(ops node:$addr), (scalar_to_vector (vec.lane_vt (load $addr)))>; + defm : LoadPatNoOffset<vec.vt, pat, inst>; + defm : LoadPatImmOff<vec.vt, pat, regPlusImm, inst>; + defm : LoadPatImmOff<vec.vt, pat, or_is_add, inst>; + defm : LoadPatOffsetOnly<vec.vt, pat, inst>; + defm : LoadPatGlobalAddrOffOnly<vec.vt, pat, inst>; +} // TODO: f32x4 and f64x2 as well foreach vec = [I32x4, I64x2] in { diff --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll --- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll @@ -1160,9 +1160,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_sext_v4i32(<4 x i16>* %p) { -; CHECK-LABEL: load_sext_v4i32: -; CHECK: .functype load_sext_v4i32 (i32) -> (v128) +define <4 x i32> @load_sext_v4i16_to_v4i32(<4 x i16>* %p) { +; CHECK-LABEL: load_sext_v4i16_to_v4i32: +; CHECK: .functype load_sext_v4i16_to_v4i32 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.load16x4_s 0 @@ -1172,9 +1172,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_zext_v4i32(<4 x i16>* %p) { -; CHECK-LABEL: load_zext_v4i32: -; CHECK: .functype load_zext_v4i32 (i32) -> (v128) +define <4 x i32> @load_zext_v4i16_to_v4i32(<4 x i16>* %p) { +; CHECK-LABEL: load_zext_v4i16_to_v4i32: +; CHECK: .functype load_zext_v4i16_to_v4i32 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ;
CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.load16x4_u 0 @@ -1184,6 +1184,39 @@ ret <4 x i32> %v2 } +define <4 x i32> @load_sext_v4i8_to_v4i32(<4 x i8>* %p) { +; CHECK-LABEL: load_sext_v4i8_to_v4i32: +; CHECK: .functype load_sext_v4i8_to_v4i32 (i32) -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %v = load <4 x i8>, <4 x i8>* %p + %v2 = sext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + +define <4 x i32> @load_zext_v4i8_to_v4i32(<4 x i8>* %p) { +; CHECK-LABEL: load_zext_v4i8_to_v4i32: +; CHECK: .functype load_zext_v4i8_to_v4i32 (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: # fallthrough-return + %v = load <4 x i8>, <4 x i8>* %p + %v2 = zext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + define <4 x i16> @load_ext_v4i32(<4 x i16>* %p) { ; CHECK-LABEL: load_ext_v4i32: ; CHECK: .functype load_ext_v4i32 (i32) -> (v128) @@ -1225,9 +1258,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_sext_v4i32_with_folded_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_sext_v4i32_with_folded_offset: -; CHECK: .functype load_sext_v4i32_with_folded_offset (i32) -> (v128) +define <4 x i32> @load_sext_from_v4i16_to_v4i32_with_folded_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_sext_from_v4i16_to_v4i32_with_folded_offset: +; CHECK: .functype load_sext_from_v4i16_to_v4i32_with_folded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.load16x4_s 16 @@ -1240,9 +1273,9 @@ ret <4 x i32> %v2 } -define <4 x i32> 
@load_zext_v4i32_with_folded_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_zext_v4i32_with_folded_offset: -; CHECK: .functype load_zext_v4i32_with_folded_offset (i32) -> (v128) +define <4 x i32> @load_zext_from_v4i16_to_v4i32_with_folded_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_zext_from_v4i16_to_v4i32_with_folded_offset: +; CHECK: .functype load_zext_from_v4i16_to_v4i32_with_folded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.load16x4_u 16 @@ -1255,6 +1288,45 @@ ret <4 x i32> %v2 } +define <4 x i32> @load_sext_from_v4i8_to_v4i32_with_folded_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_sext_from_v4i8_to_v4i32_with_folded_offset: +; CHECK: .functype load_sext_from_v4i8_to_v4i32_with_folded_offset (i32) -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 16 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint <4 x i8>* %p to i32 + %r = add nuw i32 %q, 16 + %s = inttoptr i32 %r to <4 x i8>* + %v = load <4 x i8>, <4 x i8>* %s + %v2 = sext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + +define <4 x i32> @load_zext_from_v4i8_to_v4i32_with_folded_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_zext_from_v4i8_to_v4i32_with_folded_offset: +; CHECK: .functype load_zext_from_v4i8_to_v4i32_with_folded_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 16 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint <4 x i8>* %p to i32 + %r = add nuw i32 %q, 16 + %s = inttoptr i32 %r to <4 x i8>* + %v = load <4 x i8>, <4 x i8>* %s + %v2 = zext <4 x i8> %v to 
<4 x i32> + ret <4 x i32> %v2 +} + define <4 x i16> @load_ext_v4i32_with_folded_offset(<4 x i16>* %p) { ; CHECK-LABEL: load_ext_v4i32_with_folded_offset: ; CHECK: .functype load_ext_v4i32_with_folded_offset (i32) -> (v128) @@ -1295,9 +1367,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_sext_v4i32_with_folded_gep_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_sext_v4i32_with_folded_gep_offset: -; CHECK: .functype load_sext_v4i32_with_folded_gep_offset (i32) -> (v128) +define <4 x i32> @load_sext_from_v4i16_to_v4i32_with_folded_gep_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_sext_from_v4i16_to_v4i32_with_folded_gep_offset: +; CHECK: .functype load_sext_from_v4i16_to_v4i32_with_folded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.load16x4_s 8 @@ -1308,9 +1380,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_zext_v4i32_with_folded_gep_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_zext_v4i32_with_folded_gep_offset: -; CHECK: .functype load_zext_v4i32_with_folded_gep_offset (i32) -> (v128) +define <4 x i32> @load_zext_from_v4i16_to_v4i32_with_folded_gep_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_zext_from_v4i16_to_v4i32_with_folded_gep_offset: +; CHECK: .functype load_zext_from_v4i16_to_v4i32_with_folded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.load16x4_u 8 @@ -1321,6 +1393,41 @@ ret <4 x i32> %v2 } +define <4 x i32> @load_sext_from_v4i8_to_v4i32_with_folded_gep_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_sext_from_v4i8_to_v4i32_with_folded_gep_offset: +; CHECK: .functype load_sext_from_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 4 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; 
CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 1 + %v = load <4 x i8>, <4 x i8>* %s + %v2 = sext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + +define <4 x i32> @load_zext_from_v4i8_to_v4i32_with_folded_gep_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_zext_from_v4i8_to_v4i32_with_folded_gep_offset: +; CHECK: .functype load_zext_from_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 4 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 1 + %v = load <4 x i8>, <4 x i8>* %s + %v2 = zext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + define <4 x i16> @load_ext_v4i32_with_folded_gep_offset(<4 x i16>* %p) { ; CHECK-LABEL: load_ext_v4i32_with_folded_gep_offset: ; CHECK: .functype load_ext_v4i32_with_folded_gep_offset (i32) -> (v128) @@ -1363,9 +1470,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_sext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_negative_offset: -; CHECK: .functype load_sext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) +define <4 x i32> @load_sext_from_v4i16_to_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_sext_from_v4i16_to_v4i32_with_unfolded_gep_negative_offset: +; CHECK: .functype load_sext_from_v4i16_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const -8 @@ -1378,9 +1485,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_zext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_negative_offset: -; CHECK: .functype load_zext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) +define 
<4 x i32> @load_zext_from_v4i16_to_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_zext_from_v4i16_to_v4i32_with_unfolded_gep_negative_offset: +; CHECK: .functype load_zext_from_v4i16_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const -8 @@ -1393,6 +1500,45 @@ ret <4 x i32> %v2 } +define <4 x i32> @load_sext_from_v4i8_to_v4i32_with_unfolded_gep_negative_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_sext_from_v4i8_to_v4i32_with_unfolded_gep_negative_offset: +; CHECK: .functype load_sext_from_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const -4 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 -1 + %v = load <4 x i8>, <4 x i8>* %s + %v2 = sext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + +define <4 x i32> @load_zext_from_v4i8_to_v4i32_with_unfolded_gep_negative_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_zext_from_v4i8_to_v4i32_with_unfolded_gep_negative_offset: +; CHECK: .functype load_zext_from_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const -4 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 -1 + %v = load <4 x i8>, <4 x i8>* %s + %v2 = zext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 
+} + define <4 x i16> @load_ext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) { ; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_negative_offset: ; CHECK: .functype load_ext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) @@ -1441,9 +1587,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_sext_v4i32_with_unfolded_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_sext_v4i32_with_unfolded_offset: -; CHECK: .functype load_sext_v4i32_with_unfolded_offset (i32) -> (v128) +define <4 x i32> @load_sext_from_v4i16_to_v4i32_with_unfolded_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_sext_from_v4i16_to_v4i32_with_unfolded_offset: +; CHECK: .functype load_sext_from_v4i16_to_v4i32_with_unfolded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 16 @@ -1458,9 +1604,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_zext_v4i32_with_unfolded_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_zext_v4i32_with_unfolded_offset: -; CHECK: .functype load_zext_v4i32_with_unfolded_offset (i32) -> (v128) +define <4 x i32> @load_zext_from_v4i16_to_v4i32_with_unfolded_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_zext_from_v4i16_to_v4i32_with_unfolded_offset: +; CHECK: .functype load_zext_from_v4i16_to_v4i32_with_unfolded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 16 @@ -1475,6 +1621,49 @@ ret <4 x i32> %v2 } +define <4 x i32> @load_sext_from_v4i8_to_v4i32_with_unfolded_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_sext_from_v4i8_to_v4i32_with_unfolded_offset: +; CHECK: .functype load_sext_from_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; 
CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint <4 x i8>* %p to i32 + %r = add nsw i32 %q, 16 + %s = inttoptr i32 %r to <4 x i8>* + %v = load <4 x i8>, <4 x i8>* %s + %v2 = sext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + +define <4 x i32> @load_zext_from_v4i8_to_v4i32_with_unfolded_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_zext_from_v4i8_to_v4i32_with_unfolded_offset: +; CHECK: .functype load_zext_from_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint <4 x i8>* %p to i32 + %r = add nsw i32 %q, 16 + %s = inttoptr i32 %r to <4 x i8>* + %v = load <4 x i8>, <4 x i8>* %s + %v2 = zext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + define <4 x i16> @load_ext_v4i32_with_unfolded_offset(<4 x i16>* %p) { ; CHECK-LABEL: load_ext_v4i32_with_unfolded_offset: ; CHECK: .functype load_ext_v4i32_with_unfolded_offset (i32) -> (v128) @@ -1521,9 +1710,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_sext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_offset: -; CHECK: .functype load_sext_v4i32_with_unfolded_gep_offset (i32) -> (v128) +define <4 x i32> @load_sext_from_v4i16_to_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_sext_from_v4i16_to_v4i32_with_unfolded_gep_offset: +; CHECK: .functype load_sext_from_v4i16_to_v4i32_with_unfolded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 8 @@ -1536,9 +1725,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_zext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) { -; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_offset: -; CHECK: .functype 
load_zext_v4i32_with_unfolded_gep_offset (i32) -> (v128) +define <4 x i32> @load_zext_from_v4i16_to_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) { +; CHECK-LABEL: load_zext_from_v4i16_to_v4i32_with_unfolded_gep_offset: +; CHECK: .functype load_zext_from_v4i16_to_v4i32_with_unfolded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 8 @@ -1551,6 +1740,45 @@ ret <4 x i32> %v2 } +define <4 x i32> @load_sext_from_v4i8_to_v4i32_with_unfolded_gep_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_sext_from_v4i8_to_v4i32_with_unfolded_gep_offset: +; CHECK: .functype load_sext_from_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 4 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %s = getelementptr <4 x i8>, <4 x i8>* %p, i32 1 + %v = load <4 x i8>, <4 x i8>* %s + %v2 = sext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + +define <4 x i32> @load_zext_from_v4i8_to_v4i32_with_unfolded_gep_offset(<4 x i8>* %p) { +; CHECK-LABEL: load_zext_from_v4i8_to_v4i32_with_unfolded_gep_offset: +; CHECK: .functype load_zext_from_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 4 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr <4 x i8>, <4 x i8>* %p, i32 1 + %v = load <4 x i8>, <4 x i8>* %s + %v2 = zext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + define <4 x i16> 
@load_ext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) { ; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_offset: ; CHECK: .functype load_ext_v4i32_with_unfolded_gep_offset (i32) -> (v128) @@ -1591,9 +1819,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_sext_v4i32_from_numeric_address() { -; CHECK-LABEL: load_sext_v4i32_from_numeric_address: -; CHECK: .functype load_sext_v4i32_from_numeric_address () -> (v128) +define <4 x i32> @load_sext_from_v4i16_to_v4i32_from_numeric_address() { +; CHECK-LABEL: load_sext_from_v4i16_to_v4i32_from_numeric_address: +; CHECK: .functype load_sext_from_v4i16_to_v4i32_from_numeric_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: i32x4.load16x4_s 32 @@ -1604,9 +1832,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_zext_v4i32_from_numeric_address() { -; CHECK-LABEL: load_zext_v4i32_from_numeric_address: -; CHECK: .functype load_zext_v4i32_from_numeric_address () -> (v128) +define <4 x i32> @load_zext_from_v4i16_to_v4i32_from_numeric_address() { +; CHECK-LABEL: load_zext_from_v4i16_to_v4i32_from_numeric_address: +; CHECK: .functype load_zext_from_v4i16_to_v4i32_from_numeric_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: i32x4.load16x4_u 32 @@ -1617,6 +1845,41 @@ ret <4 x i32> %v2 } +define <4 x i32> @load_sext_from_v4i8_to_v4i32_from_numeric_address() { +; CHECK-LABEL: load_sext_from_v4i8_to_v4i32_from_numeric_address: +; CHECK: .functype load_sext_from_v4i8_to_v4i32_from_numeric_address () -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: v128.load32_zero 32 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %s = inttoptr i32 32 to <4 x i8>* + %v = load <4 x i8>, <4 x i8>* %s + %v2 = sext <4 x i8> %v to <4 x i32> + ret 
<4 x i32> %v2 +} + +define <4 x i32> @load_zext_from_v4i8_to_v4i32_from_numeric_address() { +; CHECK-LABEL: load_zext_from_v4i8_to_v4i32_from_numeric_address: +; CHECK: .functype load_zext_from_v4i8_to_v4i32_from_numeric_address () -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: v128.load32_zero 32 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: # fallthrough-return + %s = inttoptr i32 32 to <4 x i8>* + %v = load <4 x i8>, <4 x i8>* %s + %v2 = zext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + define <4 x i16> @load_ext_v4i32_from_numeric_address() { ; CHECK-LABEL: load_ext_v4i32_from_numeric_address: ; CHECK: .functype load_ext_v4i32_from_numeric_address () -> (v128) @@ -1656,9 +1919,9 @@ } @gv_v4i16 = global <4 x i16> <i16 42, i16 42, i16 42, i16 42> -define <4 x i32> @load_sext_v4i32_from_global_address() { -; CHECK-LABEL: load_sext_v4i32_from_global_address: -; CHECK: .functype load_sext_v4i32_from_global_address () -> (v128) +define <4 x i32> @load_sext_from_v4i16_to_v4i32_from_global_address() { +; CHECK-LABEL: load_sext_from_v4i16_to_v4i32_from_global_address: +; CHECK: .functype load_sext_from_v4i16_to_v4i32_from_global_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: i32x4.load16x4_s gv_v4i16 @@ -1668,9 +1931,9 @@ ret <4 x i32> %v2 } -define <4 x i32> @load_zext_v4i32_from_global_address() { -; CHECK-LABEL: load_zext_v4i32_from_global_address: -; CHECK: .functype load_zext_v4i32_from_global_address () -> (v128) +define <4 x i32> @load_zext_from_v4i16_to_v4i32_from_global_address() { +; CHECK-LABEL: load_zext_from_v4i16_to_v4i32_from_global_address: +; CHECK: .functype load_zext_from_v4i16_to_v4i32_from_global_address () -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: i32x4.load16x4_u gv_v4i16 @@ -1680,6 +1943,40 @@ ret <4 x i32> %v2 } +@gv_v4i8 = global <4 x i8> <i8 42, i8 42, i8 42, i8 42> +define <4 x
i32> @load_sext_from_v4i8_to_v4i32_from_global_address() { +; CHECK-LABEL: load_sext_from_v4i8_to_v4i32_from_global_address: +; CHECK: .functype load_sext_from_v4i8_to_v4i32_from_global_address () -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: v128.load32_zero gv_v4i8 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %v = load <4 x i8>, <4 x i8>* @gv_v4i8 + %v2 = sext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + +define <4 x i32> @load_zext_from_v4i8_to_v4i32_from_global_address() { +; CHECK-LABEL: load_zext_from_v4i8_to_v4i32_from_global_address: +; CHECK: .functype load_zext_from_v4i8_to_v4i32_from_global_address () -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: v128.load32_zero gv_v4i8 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: # fallthrough-return + %v = load <4 x i8>, <4 x i8>* @gv_v4i8 + %v2 = zext <4 x i8> %v to <4 x i32> + ret <4 x i32> %v2 +} + define <4 x i16> @load_ext_v4i32_from_global_address() { ; CHECK-LABEL: load_ext_v4i32_from_global_address: ; CHECK: .functype load_ext_v4i32_from_global_address () -> (v128)