diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -119,6 +119,7 @@ TARGET_BUILTIN(__builtin_wasm_avgr_u_i16x8, "V8sV8sV8s", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_bitselect, "V4iV4iV4iV4i", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16cV16cV16cIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIi", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_any_true_i8x16, "iV16c", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_any_true_i16x8, "iV8s", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16044,6 +16044,20 @@ CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Vec->getType()}); return Builder.CreateCall(Callee, Vec); } + case WebAssembly::BI__builtin_wasm_shuffle_v8x16: { + Value *Ops[18]; + size_t OpIdx = 0; + Ops[OpIdx++] = EmitScalarExpr(E->getArg(0)); + Ops[OpIdx++] = EmitScalarExpr(E->getArg(1)); + while (OpIdx < 18) { + llvm::APSInt LaneConst; + if (!E->getArg(OpIdx)->isIntegerConstantExpr(LaneConst, getContext())) + llvm_unreachable("Constant arg isn't actually constant?"); + Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), LaneConst); + } + Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle); + return Builder.CreateCall(Callee, Ops); + } default: return nullptr; } diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -1020,23 +1020,31 @@ #define wasm_v8x16_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \ __c7, __c8, __c9, __c10, __c11, __c12, __c13, \ __c14, __c15) \ - ((v128_t)(__builtin_shufflevector( \ - (__u8x16)(__a), (__u8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5, \ - __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15))) + ((v128_t)__builtin_wasm_shuffle_v8x16( \ + (__i8x16)(__a), (__i8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5, \ + __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15)) #define wasm_v16x8_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \ __c7) \ - ((v128_t)(__builtin_shufflevector((__u16x8)(__a), (__u16x8)(__b), __c0, \ - __c1, __c2, __c3, __c4, __c5, __c6, \ - __c7))) + ((v128_t)__builtin_wasm_shuffle_v8x16( \ + (__i8x16)(__a), (__i8x16)(__b), __c0 * 2, __c0 * 2 + 1, __c1 * 2, \ + __c1 * 2 + 1, __c2 * 2, __c2 * 2 + 1, __c3 * 2, __c3 * 2 + 1, __c4 * 2, \ + __c4 * 2 + 1, __c5 * 2, __c5 * 2 + 1, __c6 * 2, __c6 * 2 + 1, __c7 * 2, \ + __c7 * 2 + 1)) #define wasm_v32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3) \ - ((v128_t)(__builtin_shufflevector((__u32x4)(__a), (__u32x4)(__b), __c0, \ - __c1, __c2, __c3))) + ((v128_t)__builtin_wasm_shuffle_v8x16( \ + (__i8x16)(__a), (__i8x16)(__b), __c0 * 4, __c0 * 4 + 1, __c0 * 4 + 2, \ + __c0 * 4 + 3, __c1 * 4, __c1 * 4 + 1, __c1 * 4 + 2, __c1 * 4 + 3, \ + __c2 * 4, __c2 * 4 + 1, __c2 * 4 + 2, __c2 * 4 + 3, __c3 * 4, \ + __c3 * 4 + 1, __c3 * 4 + 2, __c3 * 4 + 3)) #define wasm_v64x2_shuffle(__a, __b, __c0, __c1) \ - ((v128_t)( \ - __builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), __c0, __c1))) + ((v128_t)__builtin_wasm_shuffle_v8x16( \ + (__i8x16)(__a), (__i8x16)(__b), __c0 * 8, __c0 * 8 + 1, __c0 * 8 + 2, \ + __c0 * 8 + 3, __c0 * 8 + 4, __c0 * 8 + 5, __c0 * 8 + 6, __c0 * 8 + 7, \ + __c1 * 8, __c1 * 8 + 1, __c1 * 8 + 2, __c1 * 8 + 3, __c1 * 8 + 4, \ + __c1 * 8 + 5, __c1 * 8 + 6, __c1 * 8 + 7)) #ifdef __wasm_unimplemented_simd128__ diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -724,5 +724,14 @@ i8x16 swizzle_v8x16(i8x16 x, i8x16 y) { return __builtin_wasm_swizzle_v8x16(x, y); // WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y) +} + +i8x16 shuffle(i8x16 x, i8x16 y) { + return __builtin_wasm_shuffle_v8x16(x, y, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + // WEBASSEMBLY: call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y, + // WEBASSEMBLY-SAME: i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + // WEBASSEMBLY-SAME: i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, + // WEBASSEMBLY-SAME: i32 15 // WEBASSEMBLY-NEXT: ret } diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -104,6 +104,13 @@ Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, IntrSpeculatable]>; +def int_wasm_shuffle : + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; def int_wasm_sub_saturate_signed : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], @@ -116,7 +123,6 @@ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; - def int_wasm_bitselect : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], @@ -170,7 +176,6 @@ [llvm_anyvector_ty], [IntrNoMem, IntrSpeculatable]>; - //===----------------------------------------------------------------------===// // Bulk memory intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1353,6 +1353,24 @@ Op.getOperand(3) // thrown value }); } + + case Intrinsic::wasm_shuffle: { + // Drop in-chain and replace undefs, but otherwise pass through unchanged + SDValue Ops[18]; + size_t OpIdx = 0; + Ops[OpIdx++] = Op.getOperand(1); + Ops[OpIdx++] = Op.getOperand(2); + while (OpIdx < 18) { + const SDValue &MaskIdx = Op.getOperand(OpIdx + 1); + if (MaskIdx.isUndef() || + cast(MaskIdx.getNode())->getZExtValue() >= 32) { + Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32); + } else { + Ops[OpIdx++] = MaskIdx; + } + } + return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops); + } } } diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll --- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -141,6 +141,36 @@ ret <16 x i8> %a } +; CHECK-LABEL: shuffle_v16i8: +; NO-SIMD128-NOT: v8x16 +; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}} +; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1, +; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +declare <16 x i8> @llvm.wasm.shuffle( + <16 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32) +define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) { + %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y, + i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 35) + ret <16 x i8> %res +} + +; CHECK-LABEL: shuffle_undef_v16i8: +; NO-SIMD128-NOT: v8x16 +; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}} +; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1, +; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) { + %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y, + i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, + i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, + i32 undef, i32 undef, i32 undef, i32 2) + ret <16 x i8> %res +} + ; ============================================================================== ; 8 x i16 ; ==============================================================================