diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -131,6 +131,7 @@ SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsic(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -157,6 +157,12 @@ // SIMD-specific configuration if (Subtarget->hasSIMD128()) { + // Combine vector mask reductions into alltrue/anytrue + setTargetDAGCombine(ISD::SETCC); + + // Convert vector to integer bitcasts to bitmask + setTargetDAGCombine(ISD::BITCAST); + // Hoist bitcasts out of shuffles setTargetDAGCombine(ISD::VECTOR_SHUFFLE); @@ -258,6 +264,12 @@ // But saturating fp_to_int converstions are for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) setOperationAction(Op, MVT::v4i32, Custom); + + // Support vector extending + for (auto T : MVT::integer_fixedlen_vector_valuetypes()) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom); + } } // As a special case, these operators use the type to mean the type to @@ -1374,6 +1386,11 @@ // SIGN_EXTEND_INREG, but for non-vector sign extends the result might be an // illegal type. break; + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: + // Do not add any results, signifying that N should not be custom lowered. + // EXTEND_VECTOR_INREG is implemented for some vectors, but not all. 
+ break; default: llvm_unreachable( "ReplaceNodeResults not implemented for this op for WebAssembly!"); @@ -1424,6 +1441,9 @@ return LowerIntrinsic(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::ZERO_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + return LowerEXTEND_VECTOR_INREG(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: @@ -1877,6 +1897,48 @@ Op.getOperand(1)); } +SDValue +WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + + if (SrcVT.getVectorElementType() == MVT::i1 || + SrcVT.getVectorElementType() == MVT::i64) + return SDValue(); + + assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 && + "Unexpected extension factor."); + unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); + + if (Scale != 2 && Scale != 4 && Scale != 8) + return SDValue(); + + unsigned Ext; + switch (Op.getOpcode()) { + case ISD::ZERO_EXTEND_VECTOR_INREG: + Ext = WebAssemblyISD::EXTEND_LOW_U; + break; + case ISD::SIGN_EXTEND_VECTOR_INREG: + Ext = WebAssemblyISD::EXTEND_LOW_S; + break; + } + + SDValue Ret = Src; + while (Scale != 1) { + Ret = DAG.getNode(Ext, DL, + Ret.getValueType() + .widenIntegerVectorElementType(*DAG.getContext()) + .getHalfNumVectorElementsVT(*DAG.getContext()), + Ret); + Scale /= 2; + } + assert(Ret.getValueType() == VT); + return Ret; +} + static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); if (Op.getValueType() != MVT::v2f64) @@ -2692,12 +2754,90 @@ return truncateVectorWithNARROW(OutVT, In, DL, DAG); } +static SDValue performBitcastCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + SDLoc DL(N); + SDValue Src = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT SrcVT = Src.getValueType(); + + // bitcast to iN + // ==> bitmask + if (DCI.isBeforeLegalize() && VT.isScalarInteger() && + SrcVT.isFixedLengthVector() && SrcVT.getScalarType() == MVT::i1) { + unsigned NumElts = SrcVT.getVectorNumElements(); + assert(NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16); + EVT Width = MVT::getIntegerVT(128 / NumElts); + return DAG.getZExtOrTrunc( + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + {DAG.getConstant(Intrinsic::wasm_bitmask, DL, MVT::i32), + DAG.getSExtOrTrunc(N->getOperand(0), DL, + SrcVT.changeVectorElementType(Width))}), + DL, VT); + } + + return SDValue(); +} + +static SDValue performSETCCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get(); + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // setcc (iN (bitcast (vNi1 X))), 0, ne + // ==> any_true (vNi1 X) + // setcc (iN (bitcast (vNi1 X))), 0, eq + // ==> xor (any_true (vNi1 X)), -1 + // setcc (iN (bitcast (vNi1 X))), -1, eq + // ==> all_true (vNi1 X) + // setcc (iN (bitcast (vNi1 X))), -1, ne + // ==> xor (all_true (vNi1 X)), -1 + if (DCI.isBeforeLegalize() && VT.isScalarInteger() && + (Cond == ISD::SETEQ || Cond == ISD::SETNE) && + (isNullConstant(RHS) || isAllOnesConstant(RHS)) && + LHS->getOpcode() == ISD::BITCAST) { + EVT FromVT = LHS->getOperand(0).getValueType(); + if (FromVT.isFixedLengthVector() && + FromVT.getVectorElementType() == MVT::i1) { + int Intrin = isNullConstant(RHS) ?
Intrinsic::wasm_anytrue + : Intrinsic::wasm_alltrue; + unsigned NumElts = FromVT.getVectorNumElements(); + assert(NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16); + EVT Width = MVT::getIntegerVT(128 / NumElts); + SDValue Ret = DAG.getZExtOrTrunc( + DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + {DAG.getConstant(Intrin, DL, MVT::i32), + DAG.getSExtOrTrunc(LHS->getOperand(0), DL, + FromVT.changeVectorElementType(Width))}), + DL, MVT::i1); + if ((isNullConstant(RHS) && (Cond == ISD::SETEQ)) || + (isAllOnesConstant(RHS) && (Cond == ISD::SETNE))) { + Ret = DAG.getNOT(DL, Ret, MVT::i1); + } + return DAG.getZExtOrTrunc(Ret, DL, VT); + } + } + + return SDValue(); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: return SDValue(); + case ISD::BITCAST: + return performBitcastCombine(N, DCI); + case ISD::SETCC: + return performSETCCCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return performVECTOR_SHUFFLECombine(N, DCI); case ISD::SIGN_EXTEND: diff --git a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mattr=+simd128 | FileCheck %s + +;; Test that SIMD bitmask instruction can be selected + +target triple = "wasm32-unknown-unknown" + +define i16 @bitmask_v16i8(<16 x i8> %v) { +; CHECK-LABEL: bitmask_v16i8: +; CHECK: .functype bitmask_v16i8 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.eq +; CHECK-NEXT: i8x16.bitmask +; CHECK-NEXT: # fallthrough-return + %cmp = icmp eq <16 x i8> %v, zeroinitializer + %bitmask = bitcast <16 x i1> %cmp to i16 + ret i16 %bitmask +} + +define i8 @bitmask_v8i16(<8 x i16> %v) { +; CHECK-LABEL: bitmask_v8i16: +; CHECK: .functype bitmask_v8i16 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i16x8.eq +; CHECK-NEXT: i16x8.bitmask +; CHECK-NEXT: # fallthrough-return + %cmp = icmp eq <8 x i16> %v, zeroinitializer + %bitmask = bitcast <8 x i1> %cmp to i8 + ret i8 %bitmask +} + +define i8 @bitmask_v4i32(<4 x i32> %v) { +; CHECK-LABEL: bitmask_v4i32: +; CHECK: .functype bitmask_v4i32 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.const 0, 0, 0, 0 +; CHECK-NEXT: i32x4.eq +; CHECK-NEXT: i32x4.bitmask +; CHECK-NEXT: # fallthrough-return + %cmp = icmp eq <4 x i32> %v, zeroinitializer + %bitmask = bitcast <4 x i1> %cmp to i4 + %ext = zext i4 %bitmask to i8 + ret i8 %ext +} + +define i8 @bitmask_v2i64(<2 x i64> %v) { +; CHECK-LABEL: bitmask_v2i64: +; CHECK: .functype bitmask_v2i64 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.const 0, 0 +; CHECK-NEXT: i64x2.eq +; CHECK-NEXT: i64x2.bitmask +; CHECK-NEXT: # fallthrough-return + %cmp = icmp eq <2 x i64> %v, zeroinitializer + %bitmask = bitcast <2 x i1> %cmp to i2 + %ext = zext i2 %bitmask to i8 + ret i8 %ext +} + +;; Test a smaller vector + +define i8 @bitmask_v8i8(<8 x i8> %v) { +; CHECK-LABEL: bitmask_v8i8: +; CHECK: .functype bitmask_v8i8 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.eq +; CHECK-NEXT: 
i16x8.extend_low_i8x16_s +; CHECK-NEXT: i16x8.bitmask +; CHECK-NEXT: # fallthrough-return + %cmp = icmp eq <8 x i8> %v, zeroinitializer + %bitmask = bitcast <8 x i1> %cmp to i8 + ret i8 %bitmask +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll --- a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll @@ -36,9 +36,9 @@ ; CHECK-LABEL: extend_to_float_low_i8x16_u: ; CHECK: .functype extend_to_float_low_i8x16_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: f32x4.convert_i32x4_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -51,8 +51,10 @@ ; CHECK: .functype extend_to_float_high_i8x16_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK-NEXT: i8x16.shuffle 4, 17, 18, 19, 5, 21, 22, 23, 6, 25, 26, 27, 7, 29, 30, 31 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: f32x4.convert_i32x4_u ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -91,12 +93,8 @@ ; CHECK: .functype extend_to_float_low_i8x16_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -110,11 +108,9 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.shuffle 4, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -138,9 +134,8 @@ ; CHECK-LABEL: extend_to_double_low_i16x4_u: ; CHECK: .functype extend_to_double_low_i16x4_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.shuffle 16, 17, 2, 3, 18, 19, 6, 7, 20, 21, 10, 11, 22, 23, 14, 15 +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: f64x2.convert_low_i32x4_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> <i32 0, i32 1> diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending.ll b/llvm/test/CodeGen/WebAssembly/simd-extending.ll --- a/llvm/test/CodeGen/WebAssembly/simd-extending.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending.ll @@ -170,11 +170,8 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT:
i8x16.shuffle 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0 -; CHECK-NEXT: i32.const 8 -; CHECK-NEXT: i16x8.shl -; CHECK-NEXT: i32.const 8 -; CHECK-NEXT: i16x8.shr_s +; CHECK-NEXT: i8x16.shuffle 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i16x8.extend_low_i8x16_s ; CHECK-NEXT: # fallthrough-return %lowish = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> @@ -188,14 +185,81 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.shuffle 2, 3, 0, 1, 4, 5, 0, 1, 6, 7, 0, 1, 8, 9, 0, 1 -; CHECK-NEXT: i32.const 16 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 16 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i8x16.shuffle 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %lowish = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4> %extended = sext <4 x i16> %lowish to <4 x i32> ret <4 x i32> %extended } + +;; Also test vectors that aren't full 128 bits, or might require +;; multiple extensions + +define <16 x i8> @extend_i1x16_i8(<16 x i1> %v) { +; CHECK-LABEL: extend_i1x16_i8: +; CHECK: .functype extend_i1x16_i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 7 +; CHECK-NEXT: i8x16.shl +; CHECK-NEXT: i32.const 7 +; CHECK-NEXT: i8x16.shr_s +; CHECK-NEXT: # fallthrough-return + %extended = sext <16 x i1> %v to <16 x i8> + ret <16 x i8> %extended +} + +define <8 x i8> @extend_i1x8_i8(<8 x i1> %v) { +; CHECK-LABEL: extend_i1x8_i8: +; CHECK: .functype extend_i1x8_i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i32.const 7 +; CHECK-NEXT: i8x16.shl +; CHECK-NEXT: i32.const 7 +; CHECK-NEXT: i8x16.shr_s +; CHECK-NEXT: # fallthrough-return + %extended = sext <8 x i1> %v to <8 x i8> + ret <8 x i8> %extended +} + +define <8 x i16> @extend_i1x8_i16(<8 x i1> %v) { +; CHECK-LABEL: extend_i1x8_i16: +; CHECK: .functype extend_i1x8_i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.const 1, 1, 1, 1, 1, 1, 1, 1 +; CHECK-NEXT: v128.and +; CHECK-NEXT: # fallthrough-return + %extended = zext <8 x i1> %v to <8 x i16> + ret <8 x i16> %extended +} + +define <4 x i32> @extend_i8x4_i32(<4 x i8> %v) { +; CHECK-LABEL: extend_i8x4_i32: +; CHECK: .functype extend_i8x4_i32 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u +; CHECK-NEXT: # fallthrough-return + %extended = zext <4 x i8> %v to <4 x i32> + ret <4 x i32> %extended +} + +define <2 x i64> @extend_i8x2_i64(<2 x i8> %v) { +; CHECK-LABEL: extend_i8x2_i64: +; CHECK: .functype extend_i8x2_i64 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s +; CHECK-NEXT: i64x2.extend_low_i32x4_s +; CHECK-NEXT: # fallthrough-return + %extended = sext <2 x i8> %v to <2 x i64> + ret <2 x i64> %extended +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll --- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll @@ -1183,16 +1183,11 @@ define <4 x i32> @load_sext_v4i8_to_v4i32(ptr %p) { ; CHECK-LABEL: load_sext_v4i8_to_v4i32: ; CHECK: .functype load_sext_v4i8_to_v4i32 (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0 ; CHECK-NEXT: v128.load32_zero 0 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %v = load <4 x i8>, ptr %p %v2 = sext <4 x i8> %v to <4 x i32> @@ -1203,10 +1198,10 @@ ; CHECK-LABEL: load_zext_v4i8_to_v4i32: ; CHECK: .functype load_zext_v4i8_to_v4i32 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: v128.load32_zero 0 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # fallthrough-return %v = load <4 x i8>, ptr %p %v2 = zext <4 x i8> %v to <4 x i32> @@ -1287,16 +1282,11 @@ define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_offset(ptr %p) { ; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_offset: ; CHECK: .functype load_sext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: v128.load32_zero 16 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %q = ptrtoint ptr %p to i32 %r = add nuw i32 %q, 16 @@ -1310,10 +1300,10 @@ ; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_offset: ; CHECK: .functype load_zext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: v128.load32_zero 16 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # fallthrough-return %q = ptrtoint ptr %p to i32 %r = add nuw i32 %q, 16 @@ -1392,16 +1382,11 @@ define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_gep_offset(ptr %p) { ; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_gep_offset: ; CHECK: .functype load_sext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: v128.load32_zero 4 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <4 x i8>, ptr %p, i32 1 %v = load <4 x i8>, ptr %s @@ -1413,10 +1398,10 @@ ; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_gep_offset: ; CHECK: .functype load_zext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: v128.load32_zero 4 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # 
fallthrough-return %s = getelementptr inbounds <4 x i8>, ptr %p, i32 1 %v = load <4 x i8>, ptr %s @@ -1499,18 +1484,13 @@ define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(ptr %p) { ; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset: ; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const -4 ; CHECK-NEXT: i32.add ; CHECK-NEXT: v128.load32_zero 0 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <4 x i8>, ptr %p, i32 -1 %v = load <4 x i8>, ptr %s @@ -1522,12 +1502,12 @@ ; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset: ; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const -4 ; CHECK-NEXT: i32.add ; CHECK-NEXT: v128.load32_zero 0 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds <4 x i8>, ptr %p, i32 -1 %v = load <4 x i8>, ptr %s @@ -1620,18 +1600,13 @@ define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_offset(ptr %p) { ; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_offset: ; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add ; CHECK-NEXT: v128.load32_zero 0 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %q = ptrtoint ptr %p to i32 %r = add nsw i32 %q, 16 @@ -1645,12 +1620,12 @@ ; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_offset: ; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add ; CHECK-NEXT: v128.load32_zero 0 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # fallthrough-return %q = ptrtoint ptr %p to i32 %r = add nsw i32 %q, 16 @@ -1739,18 +1714,13 @@ define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_offset(ptr %p) { ; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_offset: ; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 4 ; CHECK-NEXT: i32.add ; CHECK-NEXT: v128.load32_zero 0 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; 
CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %s = getelementptr <4 x i8>, ptr %p, i32 1 %v = load <4 x i8>, ptr %s @@ -1762,12 +1732,12 @@ ; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_offset: ; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.const 4 ; CHECK-NEXT: i32.add ; CHECK-NEXT: v128.load32_zero 0 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # fallthrough-return %s = getelementptr <4 x i8>, ptr %p, i32 1 %v = load <4 x i8>, ptr %s @@ -1844,16 +1814,11 @@ define <4 x i32> @load_sext_v4i8_to_v4i32_from_numeric_address() { ; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_numeric_address: ; CHECK: .functype load_sext_v4i8_to_v4i32_from_numeric_address () -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: v128.load32_zero 32 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %s = inttoptr i32 32 to ptr %v = load <4 x i8>, ptr %s @@ -1865,10 +1830,10 @@ ; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_numeric_address: ; CHECK: .functype load_zext_v4i8_to_v4i32_from_numeric_address () -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: v128.load32_zero 32 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # fallthrough-return %s = inttoptr i32 32 to ptr %v = load <4 x i8>, ptr %s @@ -1943,16 +1908,11 @@ define <4 x i32> @load_sext_v4i8_to_v4i32_from_global_address() { ; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_global_address: ; CHECK: .functype load_sext_v4i8_to_v4i32_from_global_address () -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: v128.load32_zero gv_v4i8 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shl -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %v = load <4 x i8>, ptr @gv_v4i8 %v2 = sext <4 x i8> %v to <4 x i32> @@ -1963,10 +1923,10 @@ ; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_global_address: ; CHECK: .functype load_zext_v4i8_to_v4i32_from_global_address () -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: v128.load32_zero gv_v4i8 -; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # fallthrough-return %v = load <4 x i8>, ptr @gv_v4i8 %v2 = zext <4 x i8> %v to <4 x i32> diff --git 
a/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll @@ -0,0 +1,320 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s + +; Tests that bool vecreduce produces anytrue and alltrue instructions + +target triple = "wasm32-unknown-unknown" + +declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.or.v7i1(<7 x i1>) +declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.and.v7i1(<7 x i1>) +declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>) + +; ===================== +; Regular vectors of i1 +; ===================== + +define i1 @test_any_v8i1(<8 x i1> %x) { +; CHECK-LABEL: test_any_v8i1: +; CHECK: .functype test_any_v8i1 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 15 +; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: v128.any_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x) + ret i1 %ret +} + +define i1 @test_all_v8i1(<8 x i1> %x) { +; CHECK-LABEL: test_all_v8i1: +; CHECK: .functype test_all_v8i1 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 15 +; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: i16x8.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x) + ret i1 %ret +} + +define i1 @test_none_v8i1(<8 x i1> %x) { +; CHECK-LABEL: test_none_v8i1: +; CHECK: .functype test_none_v8i1 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 15 +; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push6=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: v128.any_true $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 1 +; CHECK-NEXT: i32.xor $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 + %any = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x) + %none = xor i1 %any, 1 + ret i1 %none +} + +define i1 @test_not_all_v8i1(<8 x i1> %x) { +; CHECK-LABEL: test_not_all_v8i1: +; CHECK: .functype test_not_all_v8i1 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 15 +; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push6=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i16x8.all_true $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 1 +; CHECK-NEXT: i32.xor $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 + %all = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x) + %notall = xor i1 %all, 1 + ret i1 %notall +} + +define i1 @test_any_v16i1(<16 x i1> %x) { +; CHECK-LABEL: test_any_v16i1: +; CHECK: .functype test_any_v16i1 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 7 +; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 7 +; CHECK-NEXT: 
i8x16.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: v128.any_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x) + ret i1 %ret +} + +define i1 @test_all_v16i1(<16 x i1> %x) { +; CHECK-LABEL: test_all_v16i1: +; CHECK: .functype test_all_v16i1 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 7 +; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 7 +; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: i8x16.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %ret = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x) + ret i1 %ret +} + +; ================================== +; Regular vectors of larger integers +; ================================== + +define i1 @test_any_v16i8(<16 x i8> %x) { +; CHECK-LABEL: test_any_v16i8: +; CHECK: .functype test_any_v16i8 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 7 +; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 7 +; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: v128.any_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %bits = trunc <16 x i8> %x to <16 x i1> + %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %bits) + ret i1 %ret +} + +define i1 @test_all_v16i8(<16 x i8> %x) { +; CHECK-LABEL: test_all_v16i8: +; CHECK: .functype test_all_v16i8 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 7 +; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 7 +; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: i8x16.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %bits = trunc <16 x i8> %x to <16 x i1> + %ret = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %bits) + ret i1 %ret +} + +define i1 @test_any_v8i16(<8 x i16> %x) { +; CHECK-LABEL: test_any_v8i16: +; CHECK: .functype test_any_v8i16 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 15 +; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: v128.any_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %bits = trunc <8 x i16> %x to <8 x i1> + %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %bits) + ret i1 %ret +} + +define i1 @test_all_v8i16(<8 x i16> %x) { +; CHECK-LABEL: test_all_v8i16: +; CHECK: .functype test_all_v8i16 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 15 +; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: i16x8.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %bits = trunc <8 x i16> %x to <8 x i1> + %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %bits) + ret i1 %ret +} + +define i1 @test_any_v4i32(<4 x i32> %x) { +; CHECK-LABEL: test_any_v4i32: +; CHECK: .functype test_any_v4i32 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 31 +; CHECK-NEXT: i32x4.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 31 +; CHECK-NEXT: i32x4.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: v128.any_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %bits = trunc <4 x i32> %x to <4 x i1> + %ret = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %bits) + ret i1 %ret +} + +define i1 @test_all_v4i32(<4 x i32> %x) { +; CHECK-LABEL: test_all_v4i32: +; CHECK: .functype test_all_v4i32 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 31 +; CHECK-NEXT: i32x4.shl $push1=, $0, 
$pop0 +; CHECK-NEXT: i32.const $push4=, 31 +; CHECK-NEXT: i32x4.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: i32x4.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %bits = trunc <4 x i32> %x to <4 x i1> + %ret = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %bits) + ret i1 %ret +} + +define i1 @test_any_v2i64(<2 x i64> %x) { +; CHECK-LABEL: test_any_v2i64: +; CHECK: .functype test_any_v2i64 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 63 +; CHECK-NEXT: i64x2.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 63 +; CHECK-NEXT: i64x2.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: v128.any_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %bits = trunc <2 x i64> %x to <2 x i1> + %ret = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %bits) + ret i1 %ret +} + +define i1 @test_all_v2i64(<2 x i64> %x) { +; CHECK-LABEL: test_all_v2i64: +; CHECK: .functype test_all_v2i64 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const $push0=, 63 +; CHECK-NEXT: i64x2.shl $push1=, $0, $pop0 +; CHECK-NEXT: i32.const $push4=, 63 +; CHECK-NEXT: i64x2.shr_s $push2=, $pop1, $pop4 +; CHECK-NEXT: i64x2.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %bits = trunc <2 x i64> %x to <2 x i1> + %ret = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %bits) + ret i1 %ret +} + +; ==================== +; Unusual vector sizes +; ==================== + +define i1 @test_any_v7i1(<7 x i1> %x) { +; CHECK-LABEL: test_any_v7i1: +; CHECK: .functype test_any_v7i1 (i32, i32, i32, i32, i32, i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.or $push0=, $0, $1 +; CHECK-NEXT: i32.or $push1=, $pop0, $2 +; CHECK-NEXT: i32.or $push2=, $pop1, $3 +; CHECK-NEXT: i32.or $push3=, $pop2, $4 +; CHECK-NEXT: i32.or $push4=, $pop3, $5 +; CHECK-NEXT: i32.or $push5=, $pop4, $6 +; CHECK-NEXT: return $pop5 + %ret = call i1 @llvm.vector.reduce.or.v7i1(<7 x i1> %x) + ret i1 %ret +} + +define i1 @test_all_v7i1(<7 x i1> %x) { +; CHECK-LABEL: test_all_v7i1: +; CHECK: .functype test_all_v7i1 (i32, i32, i32, i32, i32, i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.and $push0=, $0, $1 +; CHECK-NEXT: i32.and $push1=, $pop0, $2 +; CHECK-NEXT: i32.and $push2=, $pop1, $3 +; CHECK-NEXT: i32.and $push3=, $pop2, $4 +; CHECK-NEXT: i32.and $push4=, $pop3, $5 +; CHECK-NEXT: i32.and $push5=, $pop4, $6 +; CHECK-NEXT: i32.const $push6=, 1 +; CHECK-NEXT: i32.and $push7=, $pop5, $pop6 +; CHECK-NEXT: return $pop7 + %ret = call i1 @llvm.vector.reduce.and.v7i1(<7 x i1> %x) + ret i1 %ret +} + +define i1 @test_any_v8i8(<8 x i8> %x) { +; CHECK-LABEL: test_any_v8i8: +; CHECK: .functype test_any_v8i8 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; CHECK-NEXT: i32.const $push1=, 15 +; CHECK-NEXT: i16x8.shl $push2=, $pop0, $pop1 +; CHECK-NEXT: i32.const $push5=, 15 +; CHECK-NEXT: i16x8.shr_s $push3=, $pop2, $pop5 +; CHECK-NEXT: v128.any_true $push4=, $pop3 +; CHECK-NEXT: return $pop4 + %bits = trunc <8 x i8> %x to <8 x i1> + %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %bits) + ret i1 %ret +} + +define i1 @test_all_v8i8(<8 x i8> %x) { +; CHECK-LABEL: test_all_v8i8: +; CHECK: .functype test_all_v8i8 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; CHECK-NEXT: i32.const $push1=, 15 +; CHECK-NEXT: i16x8.shl $push2=, $pop0, $pop1 +; CHECK-NEXT: i32.const $push5=, 15 +; CHECK-NEXT: i16x8.shr_s $push3=, $pop2, $pop5 +; CHECK-NEXT: 
i16x8.all_true $push4=, $pop3 +; CHECK-NEXT: return $pop4 + %bits = trunc <8 x i8> %x to <8 x i1> + %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %bits) + ret i1 %ret +} + +;; ===================== +;; Test reduce after cmp +;; ===================== + +define i1 @test_cmp_v16i8(<16 x i8> %x) { +; CHECK-LABEL: test_cmp_v16i8: +; CHECK: .functype test_cmp_v16i8 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.eq $push1=, $0, $pop0 +; CHECK-NEXT: v128.any_true $push2=, $pop1 +; CHECK-NEXT: return $pop2 + %zero = icmp eq <16 x i8> %x, zeroinitializer + %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %zero) + ret i1 %ret +}