diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -169,5 +169,8 @@
 TARGET_BUILTIN(__builtin_wasm_narrow_s_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_narrow_u_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_load32_zero, "V4ii*", "nU", "simd128")
+TARGET_BUILTIN(__builtin_wasm_load64_zero, "V2LLiLLi*", "nU", "simd128")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16497,6 +16497,16 @@
         CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()});
     return Builder.CreateCall(Callee, {Low, High});
   }
+  case WebAssembly::BI__builtin_wasm_load32_zero: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load32_zero);
+    return Builder.CreateCall(Callee, {Ptr});
+  }
+  case WebAssembly::BI__builtin_wasm_load64_zero: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load64_zero);
+    return Builder.CreateCall(Callee, {Ptr});
+  }
   case WebAssembly::BI__builtin_wasm_shuffle_v8x16: {
     Value *Ops[18];
     size_t OpIdx = 0;
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -737,6 +737,18 @@
   // WEBASSEMBLY: ret
 }

+i32x4 load32_zero(int *p) {
+  return __builtin_wasm_load32_zero(p);
+  // WEBASSEMBLY: call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
+  // WEBASSEMBLY: ret
+}
+
+i64x2 load64_zero(long long *p) {
+  return __builtin_wasm_load64_zero(p);
+  // WEBASSEMBLY: call <2 x i64> @llvm.wasm.load64.zero(i64* %p)
+  // WEBASSEMBLY: ret
+}
+
 i8x16 swizzle_v8x16(i8x16 x, i8x16 y) {
   return __builtin_wasm_swizzle_v8x16(x, y);
   // WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -190,6 +190,20 @@
             [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;

+// TODO: Replace these intrinsics with normal ISel patterns once the
+// load_zero instructions are merged to the proposal.
+def int_wasm_load32_zero :
+  Intrinsic<[llvm_v4i32_ty],
+            [LLVMPointerType<llvm_i32_ty>],
+            [IntrReadMem, IntrArgMemOnly],
+            "", [SDNPMemOperand]>;
+
+def int_wasm_load64_zero :
+  Intrinsic<[llvm_v2i64_ty],
+            [LLVMPointerType<llvm_i64_ty>],
+            [IntrReadMem, IntrArgMemOnly],
+            "", [SDNPMemOperand]>;
+
 //===----------------------------------------------------------------------===//
 // Thread-local storage intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -232,6 +232,7 @@
   WASM_LOAD_STORE(ATOMIC_NOTIFY)
   WASM_LOAD_STORE(ATOMIC_WAIT_I32)
   WASM_LOAD_STORE(LOAD_SPLAT_v32x4)
+  WASM_LOAD_STORE(LOAD_ZERO_v4i32)
   return 2;
   WASM_LOAD_STORE(LOAD_I64)
   WASM_LOAD_STORE(LOAD_F64)
@@ -254,6 +255,7 @@
   WASM_LOAD_STORE(LOAD_EXTEND_U_v4i32)
   WASM_LOAD_STORE(LOAD_EXTEND_S_v2i64)
   WASM_LOAD_STORE(LOAD_EXTEND_U_v2i64)
+  WASM_LOAD_STORE(LOAD_ZERO_v2i64)
   return 3;
   WASM_LOAD_STORE(LOAD_V128)
   WASM_LOAD_STORE(STORE_V128)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -675,6 +675,15 @@
     Info.align = Align(8);
     Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
     return true;
+  case Intrinsic::wasm_load32_zero:
+  case Intrinsic::wasm_load64_zero:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = Intrinsic == Intrinsic::wasm_load32_zero ? MVT::i32 : MVT::i64;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = Info.memVT == MVT::i32 ? Align(4) : Align(8);
+    Info.flags = MachineMemOperand::MOLoad;
+    return true;
   default:
     return false;
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -70,7 +70,7 @@
 multiclass LoadPatNoOffset<ValueType ty, PatFrag kind, string inst> {
   def : Pat<(ty (kind I32:$addr)), (!cast<NI>(inst # "_A32") 0, 0, I32:$addr)>,
         Requires<[HasAddr32]>;
-  def : Pat<(ty (kind I64:$addr)), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
+  def : Pat<(ty (kind (i64 I64:$addr))), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
         Requires<[HasAddr64]>;
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -163,6 +163,43 @@
            "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
 }

+// Load lane into zero vector
+multiclass SIMDLoadZero<ValueType vec_t, string name, bits<32> simdop> {
+  let mayLoad = 1, UseNamedOperandTable = 1 in {
+  defm LOAD_ZERO_#vec_t#_A32 :
+    SIMD_I<(outs V128:$dst),
+           (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+           (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+           name#"\t$dst, ${off}(${addr})$p2align",
+           name#"\t$off$p2align", simdop>;
+  defm LOAD_ZERO_#vec_t#_A64 :
+    SIMD_I<(outs V128:$dst),
+           (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+           (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+           name#"\t$dst, ${off}(${addr})$p2align",
+           name#"\t$off$p2align", simdop>;
+  } // mayLoad = 1, UseNamedOperandTable = 1
+}
+
+// TODO: Also support v4f32 and v2f64 once the instructions are merged
+// to the proposal
+defm "" : SIMDLoadZero<v4i32, "v128.load32_zero", 252>;
+defm "" : SIMDLoadZero<v2i64, "v128.load64_zero", 253>;
+
+defm : LoadPatNoOffset<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
+defm : LoadPatNoOffset<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;
+
+defm : LoadPatImmOff<v4i32, int_wasm_load32_zero, regPlusImm, "LOAD_ZERO_v4i32">;
+defm : LoadPatImmOff<v2i64, int_wasm_load64_zero, regPlusImm, "LOAD_ZERO_v2i64">;
+
+defm : LoadPatImmOff<v4i32, int_wasm_load32_zero, or_is_add, "LOAD_ZERO_v4i32">;
+defm : LoadPatImmOff<v2i64, int_wasm_load64_zero, or_is_add, "LOAD_ZERO_v2i64">;
+
+defm : LoadPatOffsetOnly<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
+defm : LoadPatOffsetOnly<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;
+
+defm : LoadPatGlobalAddrOffOnly<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
+defm : LoadPatGlobalAddrOffOnly<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;

 // Store: v128.store
 let mayStore = 1, UseNamedOperandTable = 1 in {
@@ -800,7 +837,7 @@
 defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
                   [(set V128:$dst, (int_wasm_dot V128:$lhs, V128:$rhs))],
                   "i32x4.dot_i16x8_s\t$dst, $lhs, $rhs", "i32x4.dot_i16x8_s",
-                  180>;
+                  186>;

 //===----------------------------------------------------------------------===//
 // Floating-point unary arithmetic
 //===----------------------------------------------------------------------===//
@@ -1038,20 +1075,21 @@
 // Quasi-Fused Multiply-Add and Subtract (QFMA/QFMS)
 //===----------------------------------------------------------------------===//

-multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> baseInst> {
+multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> simdopA,
+                   bits<32> simdopS> {
   defm QFMA_#vec_t :
     SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
            (outs), (ins),
            [(set (vec_t V128:$dst), (int_wasm_qfma
              (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
-           vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", baseInst>;
+           vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", simdopA>;
   defm QFMS_#vec_t :
     SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
            (outs), (ins),
            [(set (vec_t V128:$dst), (int_wasm_qfms
              (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
-           vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>;
+           vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", simdopS>;
 }

-defm "" : SIMDQFM<v4f32, "f32x4", 252>;
-defm "" : SIMDQFM<v2f64, "f64x2", 254>;
+defm "" : SIMDQFM<v4f32, "f32x4", 180, 212>;
+defm "" : SIMDQFM<v2f64, "f64x2", 254, 255>;
diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test SIMD v128.load{32,64}_zero instructions
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare <4 x i32> @llvm.wasm.load32.zero(i32*)
+declare <2 x i64> @llvm.wasm.load64.zero(i64*)
+
+;===----------------------------------------------------------------------------
+; v128.load32_zero
+;===----------------------------------------------------------------------------
+
+define <4 x i32> @load_zero_i32_no_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_no_offset:
+; CHECK: .functype load_zero_i32_no_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load32_zero 0
+; CHECK-NEXT: # fallthrough-return
+  %v = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @load_zero_i32_with_folded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_offset:
+; CHECK: .functype load_zero_i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load32_zero 24
+; CHECK-NEXT: # fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_folded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_gep_offset:
+; CHECK: .functype load_zero_i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load32_zero 24
+; CHECK-NEXT: # fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_negative_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_negative_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const -24
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load32_zero 0
+; CHECK-NEXT: # fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 -6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 24
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load32_zero 0
+; CHECK-NEXT: # fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nsw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 24
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load32_zero 0
+; CHECK-NEXT: # fallthrough-return
+  %s = getelementptr i32, i32* %p, i32 6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_from_numeric_address() {
+; CHECK-LABEL: load_zero_i32_from_numeric_address:
+; CHECK: .functype load_zero_i32_from_numeric_address () -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const 0
+; CHECK-NEXT: v128.load32_zero 42
+; CHECK-NEXT: # fallthrough-return
+  %s = inttoptr i32 42 to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+@gv_i32 = global i32 0
+define <4 x i32> @load_zero_i32_from_global_address() {
+; CHECK-LABEL: load_zero_i32_from_global_address:
+; CHECK: .functype load_zero_i32_from_global_address () -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const 0
+; CHECK-NEXT: v128.load32_zero gv_i32
+; CHECK-NEXT: # fallthrough-return
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* @gv_i32)
+  ret <4 x i32> %t
+}
+
+;===----------------------------------------------------------------------------
+; v128.load64_zero
+;===----------------------------------------------------------------------------
+
+define <2 x i64> @load_zero_i64_no_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_no_offset:
+; CHECK: .functype load_zero_i64_no_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load64_zero 0
+; CHECK-NEXT: # fallthrough-return
+  %v = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %p)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @load_zero_i64_with_folded_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_folded_offset:
+; CHECK: .functype load_zero_i64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load64_zero 24
+; CHECK-NEXT: # fallthrough-return
+  %q = ptrtoint i64* %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to i64*
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_with_folded_gep_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_folded_gep_offset:
+; CHECK: .functype load_zero_i64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load64_zero 48
+; CHECK-NEXT: # fallthrough-return
+  %s = getelementptr inbounds i64, i64* %p, i64 6
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_with_unfolded_gep_negative_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_unfolded_gep_negative_offset:
+; CHECK: .functype load_zero_i64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const -48
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load64_zero 0
+; CHECK-NEXT: # fallthrough-return
+  %s = getelementptr inbounds i64, i64* %p, i64 -6
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_with_unfolded_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_unfolded_offset:
+; CHECK: .functype load_zero_i64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 24
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load64_zero 0
+; CHECK-NEXT: # fallthrough-return
+  %q = ptrtoint i64* %p to i32
+  %r = add nsw i32 %q, 24
+  %s = inttoptr i32 %r to i64*
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_with_unfolded_gep_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_unfolded_gep_offset:
+; CHECK: .functype load_zero_i64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 48
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load64_zero 0
+; CHECK-NEXT: # fallthrough-return
+  %s = getelementptr i64, i64* %p, i64 6
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_from_numeric_address() {
+; CHECK-LABEL: load_zero_i64_from_numeric_address:
+; CHECK: .functype load_zero_i64_from_numeric_address () -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const 0
+; CHECK-NEXT: v128.load64_zero 42
+; CHECK-NEXT: # fallthrough-return
+  %s = inttoptr i32 42 to i64*
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+@gv_i64 = global i64 0
+define <2 x i64> @load_zero_i64_from_global_address() {
+; CHECK-LABEL: load_zero_i64_from_global_address:
+; CHECK: .functype load_zero_i64_from_global_address () -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const 0
+; CHECK-NEXT: v128.load64_zero gv_i64
+; CHECK-NEXT: # fallthrough-return
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* @gv_i64)
+  ret <2 x i64> %t
+}
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -463,9 +463,6 @@
     # CHECK: i32x4.sub # encoding: [0xfd,0xb1,0x01]
     i32x4.sub

-    # CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xb4,0x01]
-    i32x4.dot_i16x8_s
-
     # CHECK: i32x4.mul # encoding: [0xfd,0xb5,0x01]
     i32x4.mul
@@ -481,6 +478,9 @@
     # CHECK: i32x4.max_u # encoding: [0xfd,0xb9,0x01]
     i32x4.max_u

+    # CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xba,0x01]
+    i32x4.dot_i16x8_s
+
     # CHECK: i64x2.neg # encoding: [0xfd,0xc1,0x01]
     i64x2.neg
@@ -610,10 +610,16 @@
     # CHECK: f32x4.convert_i32x4_u # encoding: [0xfd,0xfb,0x01]
     f32x4.convert_i32x4_u

-    # CHECK: f32x4.qfma # encoding: [0xfd,0xfc,0x01]
+    # CHECK: v128.load32_zero 32 # encoding: [0xfd,0xfc,0x01,0x02,0x20]
+    v128.load32_zero 32
+
+    # CHECK: v128.load64_zero 32 # encoding: [0xfd,0xfd,0x01,0x03,0x20]
+    v128.load64_zero 32
+
+    # CHECK: f32x4.qfma # encoding: [0xfd,0xb4,0x01]
     f32x4.qfma

-    # CHECK: f32x4.qfms # encoding: [0xfd,0xfd,0x01]
+    # CHECK: f32x4.qfms # encoding: [0xfd,0xd4,0x01]
     f32x4.qfms

     # CHECK: f64x2.qfma # encoding: [0xfd,0xfe,0x01]