diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -741,14 +741,20 @@
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 
-    // Vector reductions
     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                     MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+      // Vector reductions
       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+
+      // Saturates
+      setOperationAction(ISD::SADDSAT, VT, Legal);
+      setOperationAction(ISD::UADDSAT, VT, Legal);
+      setOperationAction(ISD::SSUBSAT, VT, Legal);
+      setOperationAction(ISD::USUBSAT, VT, Legal);
     }
     for (MVT VT : { MVT::v4f16, MVT::v2f32,
                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5066,6 +5066,27 @@
       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
 }
 
+// Anonymous selection patterns that map the two-operand node OpNode, for each
+// of the seven integer vector types on V64/V128 registers, onto the
+// instruction record named inst#"<type>" (looked up by name via !cast).
+multiclass SIMDThreeSameVectorExtraPatterns<string inst, SDPatternOperator OpNode> {
+  def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(inst#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(inst#"v4i16") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(inst#"v2i32") V64:$LHS, V64:$RHS)>;
+
+  def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)),
+          (!cast<Instruction>(inst#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+          (!cast<Instruction>(inst#"v8i16") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+          (!cast<Instruction>(inst#"v4i32") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+          (!cast<Instruction>(inst#"v2i64") V128:$LHS, V128:$RHS)>;
+}
+
 // As above, but D sized elements unsupported.
 multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
                                   SDPatternOperator OpNode> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3839,6 +3839,12 @@
 defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
                                                   int_aarch64_neon_sqsub>;
 
+// Extra saturate patterns, other than the intrinsics matches above
+defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"UQADD", uaddsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"SQSUB", ssubsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>;
+
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
                                   BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll
--- a/llvm/test/CodeGen/AArch64/sadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll
@@ -88,15 +88,7 @@
 define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: vec:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: add v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT: mvni v3.4s, #128, lsl #24
-; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
-; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
   %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
   ret <4 x i32> %tmp;
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@
-35,15 +35,7 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: add v2.16b, v0.16b, v1.16b -; CHECK-NEXT: cmlt v4.16b, v2.16b, #0 -; CHECK-NEXT: movi v3.16b, #127 -; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 -; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: sqadd v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %z @@ -52,24 +44,8 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: add v4.16b, v0.16b, v2.16b -; CHECK-NEXT: cmlt v7.16b, v4.16b, #0 -; CHECK-NEXT: movi v6.16b, #127 -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b -; CHECK-NEXT: add v7.16b, v1.16b, v3.16b -; CHECK-NEXT: cmlt v2.16b, v2.16b, #0 -; CHECK-NEXT: cmgt v0.16b, v0.16b, v4.16b -; CHECK-NEXT: cmlt v16.16b, v7.16b, #0 -; CHECK-NEXT: movi v5.16b, #127 -; CHECK-NEXT: cmlt v3.16b, v3.16b, #0 -; CHECK-NEXT: cmgt v1.16b, v1.16b, v7.16b -; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mvn v2.16b, v16.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b -; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b +; CHECK-NEXT: sqadd v0.16b, v0.16b, v2.16b +; CHECK-NEXT: sqadd v1.16b, v1.16b, v3.16b ; CHECK-NEXT: ret %z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y) ret <32 x i8> %z @@ -78,42 +54,10 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: add v16.16b, v0.16b, v4.16b -; CHECK-NEXT: cmlt v24.16b, v16.16b, #0 -; CHECK-NEXT: movi v18.16b, #127 -; CHECK-NEXT: add v19.16b, v1.16b, v5.16b -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl 
v18.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.16b, v19.16b, #0 -; CHECK-NEXT: movi v20.16b, #127 -; CHECK-NEXT: add v21.16b, v2.16b, v6.16b -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.16b, v21.16b, #0 -; CHECK-NEXT: cmlt v4.16b, v4.16b, #0 -; CHECK-NEXT: cmgt v0.16b, v0.16b, v16.16b -; CHECK-NEXT: movi v22.16b, #127 -; CHECK-NEXT: add v23.16b, v3.16b, v7.16b -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b -; CHECK-NEXT: cmlt v4.16b, v5.16b, #0 -; CHECK-NEXT: cmgt v1.16b, v1.16b, v19.16b -; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.16b, v23.16b, #0 -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: cmlt v4.16b, v6.16b, #0 -; CHECK-NEXT: cmgt v2.16b, v2.16b, v21.16b -; CHECK-NEXT: movi v17.16b, #127 -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: cmlt v4.16b, v7.16b, #0 -; CHECK-NEXT: cmgt v3.16b, v3.16b, v23.16b -; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b -; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b -; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b -; CHECK-NEXT: bsl v3.16b, v17.16b, v23.16b +; CHECK-NEXT: sqadd v0.16b, v0.16b, v4.16b +; CHECK-NEXT: sqadd v1.16b, v1.16b, v5.16b +; CHECK-NEXT: sqadd v2.16b, v2.16b, v6.16b +; CHECK-NEXT: sqadd v3.16b, v3.16b, v7.16b ; CHECK-NEXT: ret %z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z @@ -122,15 +66,7 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; CHECK-LABEL: v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: add v2.8h, v0.8h, v1.8h -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: mvni v3.8h, #128, lsl #8 -; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 -; CHECK-NEXT: cmgt v0.8h, v0.8h, v2.8h -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, 
v3.16b, v2.16b +; CHECK-NEXT: sqadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %z @@ -139,24 +75,8 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { ; CHECK-LABEL: v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: add v4.8h, v0.8h, v2.8h -; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 -; CHECK-NEXT: mvni v6.8h, #128, lsl #8 -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b -; CHECK-NEXT: add v7.8h, v1.8h, v3.8h -; CHECK-NEXT: cmlt v2.8h, v2.8h, #0 -; CHECK-NEXT: cmgt v0.8h, v0.8h, v4.8h -; CHECK-NEXT: cmlt v16.8h, v7.8h, #0 -; CHECK-NEXT: mvni v5.8h, #128, lsl #8 -; CHECK-NEXT: cmlt v3.8h, v3.8h, #0 -; CHECK-NEXT: cmgt v1.8h, v1.8h, v7.8h -; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mvn v2.16b, v16.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b -; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b +; CHECK-NEXT: sqadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: sqadd v1.8h, v1.8h, v3.8h ; CHECK-NEXT: ret %z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y) ret <16 x i16> %z @@ -165,42 +85,10 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; CHECK-LABEL: v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: add v16.8h, v0.8h, v4.8h -; CHECK-NEXT: cmlt v24.8h, v16.8h, #0 -; CHECK-NEXT: mvni v18.8h, #128, lsl #8 -; CHECK-NEXT: add v19.8h, v1.8h, v5.8h -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v18.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.8h, v19.8h, #0 -; CHECK-NEXT: mvni v20.8h, #128, lsl #8 -; CHECK-NEXT: add v21.8h, v2.8h, v6.8h -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.8h, v21.8h, #0 -; CHECK-NEXT: cmlt v4.8h, v4.8h, #0 -; CHECK-NEXT: cmgt v0.8h, v0.8h, v16.8h -; CHECK-NEXT: mvni v22.8h, #128, lsl #8 -; CHECK-NEXT: add v23.8h, v3.8h, v7.8h -; CHECK-NEXT: mvn v25.16b, v24.16b -; 
CHECK-NEXT: eor v0.16b, v4.16b, v0.16b -; CHECK-NEXT: cmlt v4.8h, v5.8h, #0 -; CHECK-NEXT: cmgt v1.8h, v1.8h, v19.8h -; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.8h, v23.8h, #0 -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: cmlt v4.8h, v6.8h, #0 -; CHECK-NEXT: cmgt v2.8h, v2.8h, v21.8h -; CHECK-NEXT: mvni v17.8h, #128, lsl #8 -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: cmlt v4.8h, v7.8h, #0 -; CHECK-NEXT: cmgt v3.8h, v3.8h, v23.8h -; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b -; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b -; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b -; CHECK-NEXT: bsl v3.16b, v17.16b, v23.16b +; CHECK-NEXT: sqadd v0.8h, v0.8h, v4.8h +; CHECK-NEXT: sqadd v1.8h, v1.8h, v5.8h +; CHECK-NEXT: sqadd v2.8h, v2.8h, v6.8h +; CHECK-NEXT: sqadd v3.8h, v3.8h, v7.8h ; CHECK-NEXT: ret %z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y) ret <32 x i16> %z @@ -211,15 +99,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: movi v2.8b, #127 -; CHECK-NEXT: add v3.8b, v0.8b, v1.8b -; CHECK-NEXT: cmlt v4.8b, v3.8b, #0 -; CHECK-NEXT: cmlt v1.8b, v1.8b, #0 -; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b +; CHECK-NEXT: sqadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <8 x i8>, <8 x i8>* %px @@ -248,11 +128,10 @@ ; CHECK-NEXT: mov v1.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w10 ; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: movi v1.4h, #127 -; CHECK-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-NEXT: mvni v1.4h, #127 -; CHECK-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-NEXT: shl v1.4h, v1.4h, #8 +; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: sqadd v0.4h, 
v0.4h, v1.4h +; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret @@ -266,19 +145,18 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsb w8, [x0] -; CHECK-NEXT: ldrsb w9, [x1] -; CHECK-NEXT: ldrsb w10, [x0, #1] -; CHECK-NEXT: ldrsb w11, [x1, #1] +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x1] +; CHECK-NEXT: ldrb w10, [x0, #1] +; CHECK-NEXT: ldrb w11, [x1, #1] ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2s, #127 -; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mvni v1.2s, #127 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w8, [x2, #1] @@ -296,15 +174,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: mvni v2.4h, #128, lsl #8 -; CHECK-NEXT: add v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 -; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b +; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <4 x i16>, <4 x i16>* %px @@ -317,19 +187,18 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsh w8, [x0] -; CHECK-NEXT: ldrsh w9, [x1] -; CHECK-NEXT: ldrsh w10, [x0, #2] -; CHECK-NEXT: ldrsh w11, [x1, #2] +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ldrh w9, [x1] +; CHECK-NEXT: ldrh w10, [x0, #2] +; CHECK-NEXT: ldrh 
w11, [x1, #2] ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2s, #127, msl #8 -; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mvni v1.2s, #127, msl #8 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [x2, #2] @@ -345,15 +214,7 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { ; CHECK-LABEL: v12i8: ; CHECK: // %bb.0: -; CHECK-NEXT: add v2.16b, v0.16b, v1.16b -; CHECK-NEXT: cmlt v4.16b, v2.16b, #0 -; CHECK-NEXT: movi v3.16b, #127 -; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 -; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: sqadd v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y) ret <12 x i8> %z @@ -364,24 +225,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: mvni v5.8h, #128, lsl #8 -; CHECK-NEXT: mvni v4.8h, #128, lsl #8 -; CHECK-NEXT: add v6.8h, v1.8h, v2.8h -; CHECK-NEXT: cmlt v7.8h, v6.8h, #0 -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: bsl v5.16b, v7.16b, v16.16b -; CHECK-NEXT: add v7.8h, v0.8h, v3.8h -; CHECK-NEXT: cmlt v2.8h, v2.8h, #0 -; CHECK-NEXT: cmgt v1.8h, v1.8h, v6.8h -; CHECK-NEXT: cmlt v16.8h, v7.8h, #0 -; CHECK-NEXT: cmlt v3.8h, v3.8h, #0 -; CHECK-NEXT: cmgt v0.8h, v0.8h, v7.8h -; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v16.16b -; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b -; CHECK-NEXT: bsl v4.16b, v16.16b, v2.16b -; CHECK-NEXT: bsl v1.16b, v5.16b, v6.16b -; CHECK-NEXT: bsl v0.16b, 
v4.16b, v7.16b +; CHECK-NEXT: sqadd v1.8h, v1.8h, v2.8h +; CHECK-NEXT: sqadd v0.8h, v0.8h, v3.8h ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: str d1, [x2, #16] ; CHECK-NEXT: ret @@ -397,15 +242,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: ldr b1, [x1] -; CHECK-NEXT: movi v2.8b, #127 -; CHECK-NEXT: add v3.8b, v0.8b, v1.8b -; CHECK-NEXT: cmlt v4.8b, v3.8b, #0 -; CHECK-NEXT: cmlt v1.8b, v1.8b, #0 -; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b +; CHECK-NEXT: sqadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret %x = load <1 x i8>, <1 x i8>* %px @@ -420,15 +257,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ldr h1, [x1] -; CHECK-NEXT: mvni v2.4h, #128, lsl #8 -; CHECK-NEXT: add v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 -; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b +; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret %x = load <1 x i16>, <1 x i16>* %px @@ -444,11 +273,11 @@ ; CHECK-NEXT: shl v0.16b, v0.16b, #4 ; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: sshr v0.16b, v0.16b, #4 -; CHECK-NEXT: movi v2.16b, #7 -; CHECK-NEXT: ssra v0.16b, v1.16b, #4 -; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b -; CHECK-NEXT: movi v1.16b, #248 -; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b +; CHECK-NEXT: sshr v1.16b, v1.16b, #4 +; CHECK-NEXT: shl v1.16b, v1.16b, #4 +; CHECK-NEXT: shl v0.16b, v0.16b, #4 +; CHECK-NEXT: sqadd v0.16b, v0.16b, v1.16b +; CHECK-NEXT: sshr v0.16b, v0.16b, #4 ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -460,11 +289,11 @@ ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: shl v1.16b, 
v1.16b, #7 ; CHECK-NEXT: sshr v0.16b, v0.16b, #7 -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: ssra v0.16b, v1.16b, #7 -; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b -; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b +; CHECK-NEXT: sshr v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: sqadd v0.16b, v0.16b, v1.16b +; CHECK-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z @@ -473,15 +302,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: add v2.2s, v0.2s, v1.2s -; CHECK-NEXT: cmlt v4.2s, v2.2s, #0 -; CHECK-NEXT: mvni v3.2s, #128, lsl #24 -; CHECK-NEXT: cmlt v1.2s, v1.2s, #0 -; CHECK-NEXT: cmgt v0.2s, v0.2s, v2.2s -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v3.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v3.8b, v2.8b +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z @@ -490,15 +311,7 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: add v2.4s, v0.4s, v1.4s -; CHECK-NEXT: cmlt v4.4s, v2.4s, #0 -; CHECK-NEXT: mvni v3.4s, #128, lsl #24 -; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 -; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z @@ -507,24 +320,8 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; CHECK-LABEL: v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: add v4.4s, v0.4s, v2.4s -; CHECK-NEXT: cmlt v7.4s, 
v4.4s, #0 -; CHECK-NEXT: mvni v6.4s, #128, lsl #24 -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b -; CHECK-NEXT: add v7.4s, v1.4s, v3.4s -; CHECK-NEXT: cmlt v2.4s, v2.4s, #0 -; CHECK-NEXT: cmgt v0.4s, v0.4s, v4.4s -; CHECK-NEXT: cmlt v16.4s, v7.4s, #0 -; CHECK-NEXT: mvni v5.4s, #128, lsl #24 -; CHECK-NEXT: cmlt v3.4s, v3.4s, #0 -; CHECK-NEXT: cmgt v1.4s, v1.4s, v7.4s -; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mvn v2.16b, v16.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b -; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b +; CHECK-NEXT: sqadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sqadd v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z @@ -533,42 +330,10 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; CHECK-LABEL: v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: add v16.4s, v0.4s, v4.4s -; CHECK-NEXT: cmlt v24.4s, v16.4s, #0 -; CHECK-NEXT: mvni v18.4s, #128, lsl #24 -; CHECK-NEXT: add v19.4s, v1.4s, v5.4s -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v18.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.4s, v19.4s, #0 -; CHECK-NEXT: mvni v20.4s, #128, lsl #24 -; CHECK-NEXT: add v21.4s, v2.4s, v6.4s -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.4s, v21.4s, #0 -; CHECK-NEXT: cmlt v4.4s, v4.4s, #0 -; CHECK-NEXT: cmgt v0.4s, v0.4s, v16.4s -; CHECK-NEXT: mvni v22.4s, #128, lsl #24 -; CHECK-NEXT: add v23.4s, v3.4s, v7.4s -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b -; CHECK-NEXT: cmlt v4.4s, v5.4s, #0 -; CHECK-NEXT: cmgt v1.4s, v1.4s, v19.4s -; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.4s, v23.4s, #0 -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: cmlt v4.4s, v6.4s, #0 -; CHECK-NEXT: cmgt v2.4s, v2.4s, v21.4s -; CHECK-NEXT: mvni v17.4s, 
#128, lsl #24 -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: cmlt v4.4s, v7.4s, #0 -; CHECK-NEXT: cmgt v3.4s, v3.4s, v23.4s -; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b -; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b -; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b -; CHECK-NEXT: bsl v3.16b, v17.16b, v23.16b +; CHECK-NEXT: sqadd v0.4s, v0.4s, v4.4s +; CHECK-NEXT: sqadd v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sqadd v2.4s, v2.4s, v6.4s +; CHECK-NEXT: sqadd v3.4s, v3.4s, v7.4s ; CHECK-NEXT: ret %z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y) ret <16 x i32> %z @@ -577,16 +342,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add v2.2d, v0.2d, v1.2d -; CHECK-NEXT: mov x8, #9223372036854775807 -; CHECK-NEXT: cmlt v3.2d, v2.2d, #0 -; CHECK-NEXT: cmlt v1.2d, v1.2d, #0 -; CHECK-NEXT: dup v4.2d, x8 -; CHECK-NEXT: cmgt v0.2d, v0.2d, v2.2d -; CHECK-NEXT: mvn v5.16b, v3.16b -; CHECK-NEXT: bsl v4.16b, v3.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b +; CHECK-NEXT: sqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z @@ -595,25 +351,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add v4.2d, v0.2d, v2.2d -; CHECK-NEXT: mov x8, #9223372036854775807 -; CHECK-NEXT: cmlt v5.2d, v4.2d, #0 -; CHECK-NEXT: dup v6.2d, x8 -; CHECK-NEXT: mvn v7.16b, v5.16b -; CHECK-NEXT: mov v16.16b, v6.16b -; CHECK-NEXT: bsl v16.16b, v5.16b, v7.16b -; CHECK-NEXT: add v5.2d, v1.2d, v3.2d -; CHECK-NEXT: cmlt v2.2d, v2.2d, #0 -; CHECK-NEXT: cmgt v0.2d, v0.2d, v4.2d -; CHECK-NEXT: cmlt v7.2d, v5.2d, #0 -; CHECK-NEXT: cmlt v3.2d, v3.2d, #0 -; CHECK-NEXT: cmgt v1.2d, v1.2d, v5.2d -; CHECK-NEXT: eor 
v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mvn v2.16b, v7.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v6.16b, v7.16b, v2.16b -; CHECK-NEXT: bsl v0.16b, v16.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v6.16b, v5.16b +; CHECK-NEXT: sqadd v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sqadd v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z @@ -622,43 +361,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add v16.2d, v0.2d, v4.2d -; CHECK-NEXT: mov x8, #9223372036854775807 -; CHECK-NEXT: add v17.2d, v1.2d, v5.2d -; CHECK-NEXT: cmlt v20.2d, v16.2d, #0 -; CHECK-NEXT: dup v21.2d, x8 -; CHECK-NEXT: add v18.2d, v2.2d, v6.2d -; CHECK-NEXT: cmlt v22.2d, v17.2d, #0 -; CHECK-NEXT: mvn v24.16b, v20.16b -; CHECK-NEXT: mov v25.16b, v21.16b -; CHECK-NEXT: cmlt v23.2d, v18.2d, #0 -; CHECK-NEXT: bsl v25.16b, v20.16b, v24.16b -; CHECK-NEXT: mvn v20.16b, v22.16b -; CHECK-NEXT: mov v24.16b, v21.16b -; CHECK-NEXT: cmlt v4.2d, v4.2d, #0 -; CHECK-NEXT: cmgt v0.2d, v0.2d, v16.2d -; CHECK-NEXT: add v19.2d, v3.2d, v7.2d -; CHECK-NEXT: bsl v24.16b, v22.16b, v20.16b -; CHECK-NEXT: mvn v20.16b, v23.16b -; CHECK-NEXT: mov v22.16b, v21.16b -; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b -; CHECK-NEXT: cmlt v4.2d, v5.2d, #0 -; CHECK-NEXT: cmgt v1.2d, v1.2d, v17.2d -; CHECK-NEXT: bsl v22.16b, v23.16b, v20.16b -; CHECK-NEXT: cmlt v20.2d, v19.2d, #0 -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: cmlt v4.2d, v6.2d, #0 -; CHECK-NEXT: cmgt v2.2d, v2.2d, v18.2d -; CHECK-NEXT: mvn v23.16b, v20.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: cmlt v4.2d, v7.2d, #0 -; CHECK-NEXT: cmgt v3.2d, v3.2d, v19.2d -; CHECK-NEXT: bsl v21.16b, v20.16b, v23.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v0.16b, v25.16b, v16.16b -; CHECK-NEXT: bsl v1.16b, v24.16b, v17.16b -; CHECK-NEXT: bsl v2.16b, v22.16b, v18.16b -; CHECK-NEXT: bsl v3.16b, 
v21.16b, v19.16b +; CHECK-NEXT: sqadd v0.2d, v0.2d, v4.2d +; CHECK-NEXT: sqadd v1.2d, v1.2d, v5.2d +; CHECK-NEXT: sqadd v2.2d, v2.2d, v6.2d +; CHECK-NEXT: sqadd v3.2d, v3.2d, v7.2d ; CHECK-NEXT: ret %z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y) ret <8 x i64> %z diff --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll --- a/llvm/test/CodeGen/AArch64/ssub_sat.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll @@ -88,15 +88,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: vec: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s -; CHECK-NEXT: cmlt v4.4s, v2.4s, #0 -; CHECK-NEXT: mvni v3.4s, #128, lsl #24 -; CHECK-NEXT: cmgt v1.4s, v1.4s, #0 -; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y); ret <4 x i32> %tmp; diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -36,15 +36,7 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v2.16b, v0.16b, v1.16b -; CHECK-NEXT: cmlt v4.16b, v2.16b, #0 -; CHECK-NEXT: movi v3.16b, #127 -; CHECK-NEXT: cmgt v1.16b, v1.16b, #0 -; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %z @@ -53,24 +45,8 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; 
CHECK-LABEL: v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v4.16b, v0.16b, v2.16b -; CHECK-NEXT: cmlt v7.16b, v4.16b, #0 -; CHECK-NEXT: movi v6.16b, #127 -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b -; CHECK-NEXT: sub v7.16b, v1.16b, v3.16b -; CHECK-NEXT: cmgt v2.16b, v2.16b, #0 -; CHECK-NEXT: cmgt v0.16b, v0.16b, v4.16b -; CHECK-NEXT: cmlt v16.16b, v7.16b, #0 -; CHECK-NEXT: movi v5.16b, #127 -; CHECK-NEXT: cmgt v3.16b, v3.16b, #0 -; CHECK-NEXT: cmgt v1.16b, v1.16b, v7.16b -; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mvn v2.16b, v16.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b -; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b +; CHECK-NEXT: sqsub v0.16b, v0.16b, v2.16b +; CHECK-NEXT: sqsub v1.16b, v1.16b, v3.16b ; CHECK-NEXT: ret %z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y) ret <32 x i8> %z @@ -79,42 +55,10 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v16.16b, v0.16b, v4.16b -; CHECK-NEXT: cmlt v24.16b, v16.16b, #0 -; CHECK-NEXT: movi v18.16b, #127 -; CHECK-NEXT: sub v19.16b, v1.16b, v5.16b -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v18.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.16b, v19.16b, #0 -; CHECK-NEXT: movi v20.16b, #127 -; CHECK-NEXT: sub v21.16b, v2.16b, v6.16b -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.16b, v21.16b, #0 -; CHECK-NEXT: cmgt v4.16b, v4.16b, #0 -; CHECK-NEXT: cmgt v0.16b, v0.16b, v16.16b -; CHECK-NEXT: movi v22.16b, #127 -; CHECK-NEXT: sub v23.16b, v3.16b, v7.16b -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b -; CHECK-NEXT: cmgt v4.16b, v5.16b, #0 -; CHECK-NEXT: cmgt v1.16b, v1.16b, v19.16b -; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.16b, v23.16b, #0 -; CHECK-NEXT: eor v1.16b, v4.16b, 
v1.16b -; CHECK-NEXT: cmgt v4.16b, v6.16b, #0 -; CHECK-NEXT: cmgt v2.16b, v2.16b, v21.16b -; CHECK-NEXT: movi v17.16b, #127 -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: cmgt v4.16b, v7.16b, #0 -; CHECK-NEXT: cmgt v3.16b, v3.16b, v23.16b -; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b -; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b -; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b -; CHECK-NEXT: bsl v3.16b, v17.16b, v23.16b +; CHECK-NEXT: sqsub v0.16b, v0.16b, v4.16b +; CHECK-NEXT: sqsub v1.16b, v1.16b, v5.16b +; CHECK-NEXT: sqsub v2.16b, v2.16b, v6.16b +; CHECK-NEXT: sqsub v3.16b, v3.16b, v7.16b ; CHECK-NEXT: ret %z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z @@ -123,15 +67,7 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; CHECK-LABEL: v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v2.8h, v0.8h, v1.8h -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: mvni v3.8h, #128, lsl #8 -; CHECK-NEXT: cmgt v1.8h, v1.8h, #0 -; CHECK-NEXT: cmgt v0.8h, v0.8h, v2.8h -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: sqsub v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %z @@ -140,24 +76,8 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { ; CHECK-LABEL: v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v4.8h, v0.8h, v2.8h -; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 -; CHECK-NEXT: mvni v6.8h, #128, lsl #8 -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b -; CHECK-NEXT: sub v7.8h, v1.8h, v3.8h -; CHECK-NEXT: cmgt v2.8h, v2.8h, #0 -; CHECK-NEXT: cmgt v0.8h, v0.8h, v4.8h -; CHECK-NEXT: cmlt v16.8h, v7.8h, #0 -; CHECK-NEXT: mvni v5.8h, #128, lsl #8 -; CHECK-NEXT: cmgt v3.8h, 
v3.8h, #0 -; CHECK-NEXT: cmgt v1.8h, v1.8h, v7.8h -; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mvn v2.16b, v16.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b -; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b +; CHECK-NEXT: sqsub v0.8h, v0.8h, v2.8h +; CHECK-NEXT: sqsub v1.8h, v1.8h, v3.8h ; CHECK-NEXT: ret %z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y) ret <16 x i16> %z @@ -166,42 +86,10 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; CHECK-LABEL: v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v16.8h, v0.8h, v4.8h -; CHECK-NEXT: cmlt v24.8h, v16.8h, #0 -; CHECK-NEXT: mvni v18.8h, #128, lsl #8 -; CHECK-NEXT: sub v19.8h, v1.8h, v5.8h -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v18.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.8h, v19.8h, #0 -; CHECK-NEXT: mvni v20.8h, #128, lsl #8 -; CHECK-NEXT: sub v21.8h, v2.8h, v6.8h -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.8h, v21.8h, #0 -; CHECK-NEXT: cmgt v4.8h, v4.8h, #0 -; CHECK-NEXT: cmgt v0.8h, v0.8h, v16.8h -; CHECK-NEXT: mvni v22.8h, #128, lsl #8 -; CHECK-NEXT: sub v23.8h, v3.8h, v7.8h -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b -; CHECK-NEXT: cmgt v4.8h, v5.8h, #0 -; CHECK-NEXT: cmgt v1.8h, v1.8h, v19.8h -; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.8h, v23.8h, #0 -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: cmgt v4.8h, v6.8h, #0 -; CHECK-NEXT: cmgt v2.8h, v2.8h, v21.8h -; CHECK-NEXT: mvni v17.8h, #128, lsl #8 -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: cmgt v4.8h, v7.8h, #0 -; CHECK-NEXT: cmgt v3.8h, v3.8h, v23.8h -; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b -; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b -; 
CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b -; CHECK-NEXT: bsl v3.16b, v17.16b, v23.16b +; CHECK-NEXT: sqsub v0.8h, v0.8h, v4.8h +; CHECK-NEXT: sqsub v1.8h, v1.8h, v5.8h +; CHECK-NEXT: sqsub v2.8h, v2.8h, v6.8h +; CHECK-NEXT: sqsub v3.8h, v3.8h, v7.8h ; CHECK-NEXT: ret %z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y) ret <32 x i16> %z @@ -212,15 +100,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: movi v2.8b, #127 -; CHECK-NEXT: sub v3.8b, v0.8b, v1.8b -; CHECK-NEXT: cmlt v4.8b, v3.8b, #0 -; CHECK-NEXT: cmgt v1.8b, v1.8b, #0 -; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b +; CHECK-NEXT: sqsub v0.8b, v0.8b, v1.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <8 x i8>, <8 x i8>* %px @@ -249,11 +129,10 @@ ; CHECK-NEXT: mov v1.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w10 ; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: movi v1.4h, #127 -; CHECK-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-NEXT: mvni v1.4h, #127 -; CHECK-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-NEXT: shl v1.4h, v1.4h, #8 +; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret @@ -267,19 +146,18 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsb w8, [x0] -; CHECK-NEXT: ldrsb w9, [x1] -; CHECK-NEXT: ldrsb w10, [x0, #1] -; CHECK-NEXT: ldrsb w11, [x1, #1] +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x1] +; CHECK-NEXT: ldrb w10, [x0, #1] +; CHECK-NEXT: ldrb w11, [x1, #1] ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi 
v1.2s, #127 -; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mvni v1.2s, #127 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w8, [x2, #1] @@ -297,15 +175,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: mvni v2.4h, #128, lsl #8 -; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmgt v1.4h, v1.4h, #0 -; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b +; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <4 x i16>, <4 x i16>* %px @@ -318,19 +188,18 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsh w8, [x0] -; CHECK-NEXT: ldrsh w9, [x1] -; CHECK-NEXT: ldrsh w10, [x0, #2] -; CHECK-NEXT: ldrsh w11, [x1, #2] +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ldrh w9, [x1] +; CHECK-NEXT: ldrh w10, [x0, #2] +; CHECK-NEXT: ldrh w11, [x1, #2] ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2s, #127, msl #8 -; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mvni v1.2s, #127, msl #8 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [x2, #2] @@ -346,15 +215,7 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { ; CHECK-LABEL: v12i8: ; CHECK: // %bb.0: 
-; CHECK-NEXT: sub v2.16b, v0.16b, v1.16b -; CHECK-NEXT: cmlt v4.16b, v2.16b, #0 -; CHECK-NEXT: movi v3.16b, #127 -; CHECK-NEXT: cmgt v1.16b, v1.16b, #0 -; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y) ret <12 x i8> %z @@ -365,24 +226,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: mvni v5.8h, #128, lsl #8 -; CHECK-NEXT: mvni v4.8h, #128, lsl #8 -; CHECK-NEXT: sub v6.8h, v1.8h, v2.8h -; CHECK-NEXT: cmlt v7.8h, v6.8h, #0 -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: bsl v5.16b, v7.16b, v16.16b -; CHECK-NEXT: sub v7.8h, v0.8h, v3.8h -; CHECK-NEXT: cmgt v2.8h, v2.8h, #0 -; CHECK-NEXT: cmgt v1.8h, v1.8h, v6.8h -; CHECK-NEXT: cmlt v16.8h, v7.8h, #0 -; CHECK-NEXT: cmgt v3.8h, v3.8h, #0 -; CHECK-NEXT: cmgt v0.8h, v0.8h, v7.8h -; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v16.16b -; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b -; CHECK-NEXT: bsl v4.16b, v16.16b, v2.16b -; CHECK-NEXT: bsl v1.16b, v5.16b, v6.16b -; CHECK-NEXT: bsl v0.16b, v4.16b, v7.16b +; CHECK-NEXT: sqsub v1.8h, v1.8h, v2.8h +; CHECK-NEXT: sqsub v0.8h, v0.8h, v3.8h ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: str d1, [x2, #16] ; CHECK-NEXT: ret @@ -398,15 +243,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: ldr b1, [x1] -; CHECK-NEXT: movi v2.8b, #127 -; CHECK-NEXT: sub v3.8b, v0.8b, v1.8b -; CHECK-NEXT: cmlt v4.8b, v3.8b, #0 -; CHECK-NEXT: cmgt v1.8b, v1.8b, #0 -; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b +; CHECK-NEXT: sqsub v0.8b, v0.8b, v1.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret 
%x = load <1 x i8>, <1 x i8>* %px @@ -421,15 +258,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ldr h1, [x1] -; CHECK-NEXT: mvni v2.4h, #128, lsl #8 -; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmgt v1.4h, v1.4h, #0 -; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b +; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret %x = load <1 x i16>, <1 x i16>* %px @@ -442,15 +271,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 +; CHECK-NEXT: shl v1.16b, v1.16b, #4 +; CHECK-NEXT: sshr v0.16b, v0.16b, #4 ; CHECK-NEXT: sshr v1.16b, v1.16b, #4 +; CHECK-NEXT: shl v1.16b, v1.16b, #4 +; CHECK-NEXT: shl v0.16b, v0.16b, #4 +; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b ; CHECK-NEXT: sshr v0.16b, v0.16b, #4 -; CHECK-NEXT: movi v2.16b, #7 -; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b -; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b -; CHECK-NEXT: movi v1.16b, #248 -; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -459,15 +287,14 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; CHECK-LABEL: v16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-NEXT: sshr v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b ; CHECK-NEXT: sshr v0.16b, v0.16b, #7 -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b -; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b -; CHECK-NEXT: movi v1.2d, 
#0xffffffffffffffff -; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z @@ -476,15 +303,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v2.2s, v0.2s, v1.2s -; CHECK-NEXT: cmlt v4.2s, v2.2s, #0 -; CHECK-NEXT: mvni v3.2s, #128, lsl #24 -; CHECK-NEXT: cmgt v1.2s, v1.2s, #0 -; CHECK-NEXT: cmgt v0.2s, v0.2s, v2.2s -; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: bsl v3.8b, v4.8b, v5.8b -; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b -; CHECK-NEXT: bsl v0.8b, v3.8b, v2.8b +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z @@ -493,15 +312,7 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s -; CHECK-NEXT: cmlt v4.4s, v2.4s, #0 -; CHECK-NEXT: mvni v3.4s, #128, lsl #24 -; CHECK-NEXT: cmgt v1.4s, v1.4s, #0 -; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s -; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z @@ -510,24 +321,8 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; CHECK-LABEL: v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v4.4s, v0.4s, v2.4s -; CHECK-NEXT: cmlt v7.4s, v4.4s, #0 -; CHECK-NEXT: mvni v6.4s, #128, lsl #24 -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b -; CHECK-NEXT: sub v7.4s, v1.4s, v3.4s -; CHECK-NEXT: cmgt v2.4s, v2.4s, #0 -; CHECK-NEXT: cmgt v0.4s, v0.4s, v4.4s -; CHECK-NEXT: cmlt v16.4s, v7.4s, #0 -; CHECK-NEXT: mvni v5.4s, #128, lsl #24 -; CHECK-NEXT: cmgt v3.4s, v3.4s, #0 -; CHECK-NEXT: cmgt v1.4s, v1.4s, v7.4s -; 
CHECK-NEXT: eor v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mvn v2.16b, v16.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b -; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b +; CHECK-NEXT: sqsub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sqsub v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z @@ -536,42 +331,10 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; CHECK-LABEL: v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v16.4s, v0.4s, v4.4s -; CHECK-NEXT: cmlt v24.4s, v16.4s, #0 -; CHECK-NEXT: mvni v18.4s, #128, lsl #24 -; CHECK-NEXT: sub v19.4s, v1.4s, v5.4s -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v18.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.4s, v19.4s, #0 -; CHECK-NEXT: mvni v20.4s, #128, lsl #24 -; CHECK-NEXT: sub v21.4s, v2.4s, v6.4s -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.4s, v21.4s, #0 -; CHECK-NEXT: cmgt v4.4s, v4.4s, #0 -; CHECK-NEXT: cmgt v0.4s, v0.4s, v16.4s -; CHECK-NEXT: mvni v22.4s, #128, lsl #24 -; CHECK-NEXT: sub v23.4s, v3.4s, v7.4s -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b -; CHECK-NEXT: cmgt v4.4s, v5.4s, #0 -; CHECK-NEXT: cmgt v1.4s, v1.4s, v19.4s -; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b -; CHECK-NEXT: cmlt v24.4s, v23.4s, #0 -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: cmgt v4.4s, v6.4s, #0 -; CHECK-NEXT: cmgt v2.4s, v2.4s, v21.4s -; CHECK-NEXT: mvni v17.4s, #128, lsl #24 -; CHECK-NEXT: mvn v25.16b, v24.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: cmgt v4.4s, v7.4s, #0 -; CHECK-NEXT: cmgt v3.4s, v3.4s, v23.4s -; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b -; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b -; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b -; CHECK-NEXT: bsl 
v3.16b, v17.16b, v23.16b +; CHECK-NEXT: sqsub v0.4s, v0.4s, v4.4s +; CHECK-NEXT: sqsub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sqsub v2.4s, v2.4s, v6.4s +; CHECK-NEXT: sqsub v3.4s, v3.4s, v7.4s ; CHECK-NEXT: ret %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y) ret <16 x i32> %z @@ -580,16 +343,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v2.2d, v0.2d, v1.2d -; CHECK-NEXT: mov x8, #9223372036854775807 -; CHECK-NEXT: cmlt v3.2d, v2.2d, #0 -; CHECK-NEXT: cmgt v1.2d, v1.2d, #0 -; CHECK-NEXT: dup v4.2d, x8 -; CHECK-NEXT: cmgt v0.2d, v0.2d, v2.2d -; CHECK-NEXT: mvn v5.16b, v3.16b -; CHECK-NEXT: bsl v4.16b, v3.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b +; CHECK-NEXT: sqsub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z @@ -598,25 +352,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v4.2d, v0.2d, v2.2d -; CHECK-NEXT: mov x8, #9223372036854775807 -; CHECK-NEXT: cmlt v5.2d, v4.2d, #0 -; CHECK-NEXT: dup v6.2d, x8 -; CHECK-NEXT: mvn v7.16b, v5.16b -; CHECK-NEXT: mov v16.16b, v6.16b -; CHECK-NEXT: bsl v16.16b, v5.16b, v7.16b -; CHECK-NEXT: sub v5.2d, v1.2d, v3.2d -; CHECK-NEXT: cmgt v2.2d, v2.2d, #0 -; CHECK-NEXT: cmgt v0.2d, v0.2d, v4.2d -; CHECK-NEXT: cmlt v7.2d, v5.2d, #0 -; CHECK-NEXT: cmgt v3.2d, v3.2d, #0 -; CHECK-NEXT: cmgt v1.2d, v1.2d, v5.2d -; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mvn v2.16b, v7.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v6.16b, v7.16b, v2.16b -; CHECK-NEXT: bsl v0.16b, v16.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v6.16b, v5.16b +; CHECK-NEXT: sqsub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sqsub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> 
%z @@ -625,43 +362,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v16.2d, v0.2d, v4.2d -; CHECK-NEXT: mov x8, #9223372036854775807 -; CHECK-NEXT: sub v17.2d, v1.2d, v5.2d -; CHECK-NEXT: cmlt v20.2d, v16.2d, #0 -; CHECK-NEXT: dup v21.2d, x8 -; CHECK-NEXT: sub v18.2d, v2.2d, v6.2d -; CHECK-NEXT: cmlt v22.2d, v17.2d, #0 -; CHECK-NEXT: mvn v24.16b, v20.16b -; CHECK-NEXT: mov v25.16b, v21.16b -; CHECK-NEXT: cmlt v23.2d, v18.2d, #0 -; CHECK-NEXT: bsl v25.16b, v20.16b, v24.16b -; CHECK-NEXT: mvn v20.16b, v22.16b -; CHECK-NEXT: mov v24.16b, v21.16b -; CHECK-NEXT: cmgt v4.2d, v4.2d, #0 -; CHECK-NEXT: cmgt v0.2d, v0.2d, v16.2d -; CHECK-NEXT: sub v19.2d, v3.2d, v7.2d -; CHECK-NEXT: bsl v24.16b, v22.16b, v20.16b -; CHECK-NEXT: mvn v20.16b, v23.16b -; CHECK-NEXT: mov v22.16b, v21.16b -; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b -; CHECK-NEXT: cmgt v4.2d, v5.2d, #0 -; CHECK-NEXT: cmgt v1.2d, v1.2d, v17.2d -; CHECK-NEXT: bsl v22.16b, v23.16b, v20.16b -; CHECK-NEXT: cmlt v20.2d, v19.2d, #0 -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: cmgt v4.2d, v6.2d, #0 -; CHECK-NEXT: cmgt v2.2d, v2.2d, v18.2d -; CHECK-NEXT: mvn v23.16b, v20.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: cmgt v4.2d, v7.2d, #0 -; CHECK-NEXT: cmgt v3.2d, v3.2d, v19.2d -; CHECK-NEXT: bsl v21.16b, v20.16b, v23.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v0.16b, v25.16b, v16.16b -; CHECK-NEXT: bsl v1.16b, v24.16b, v17.16b -; CHECK-NEXT: bsl v2.16b, v22.16b, v18.16b -; CHECK-NEXT: bsl v3.16b, v21.16b, v19.16b +; CHECK-NEXT: sqsub v0.2d, v0.2d, v4.2d +; CHECK-NEXT: sqsub v1.2d, v1.2d, v5.2d +; CHECK-NEXT: sqsub v2.2d, v2.2d, v6.2d +; CHECK-NEXT: sqsub v3.2d, v3.2d, v7.2d ; CHECK-NEXT: ret %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y) ret <8 x i64> %z diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll --- 
a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -35,9 +35,7 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v2.16b, v1.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %z @@ -46,12 +44,8 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v4.16b, v2.16b -; CHECK-NEXT: mvn v5.16b, v3.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v4.16b -; CHECK-NEXT: umin v1.16b, v1.16b, v5.16b -; CHECK-NEXT: add v0.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.16b, v1.16b, v3.16b +; CHECK-NEXT: uqadd v0.16b, v0.16b, v2.16b +; CHECK-NEXT: uqadd v1.16b, v1.16b, v3.16b ; CHECK-NEXT: ret %z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y) ret <32 x i8> %z @@ -60,18 +54,10 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v16.16b, v4.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v16.16b -; CHECK-NEXT: mvn v16.16b, v5.16b -; CHECK-NEXT: umin v1.16b, v1.16b, v16.16b -; CHECK-NEXT: mvn v16.16b, v6.16b -; CHECK-NEXT: umin v2.16b, v2.16b, v16.16b -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: umin v3.16b, v3.16b, v16.16b -; CHECK-NEXT: add v0.16b, v0.16b, v4.16b -; CHECK-NEXT: add v1.16b, v1.16b, v5.16b -; CHECK-NEXT: add v2.16b, v2.16b, v6.16b -; CHECK-NEXT: add v3.16b, v3.16b, v7.16b +; CHECK-NEXT: uqadd v0.16b, v0.16b, v4.16b +; CHECK-NEXT: uqadd v1.16b, v1.16b, v5.16b +; CHECK-NEXT: uqadd v2.16b, v2.16b, v6.16b +; CHECK-NEXT: uqadd v3.16b, v3.16b, v7.16b ; CHECK-NEXT: ret %z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z @@ -80,9 +66,7 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) 
nounwind { ; CHECK-LABEL: v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v2.16b, v1.16b -; CHECK-NEXT: umin v0.8h, v0.8h, v2.8h -; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uqadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %z = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %z @@ -91,12 +75,8 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { ; CHECK-LABEL: v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v4.16b, v2.16b -; CHECK-NEXT: mvn v5.16b, v3.16b -; CHECK-NEXT: umin v0.8h, v0.8h, v4.8h -; CHECK-NEXT: umin v1.8h, v1.8h, v5.8h -; CHECK-NEXT: add v0.8h, v0.8h, v2.8h -; CHECK-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-NEXT: uqadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: uqadd v1.8h, v1.8h, v3.8h ; CHECK-NEXT: ret %z = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y) ret <16 x i16> %z @@ -105,18 +85,10 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; CHECK-LABEL: v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v16.16b, v4.16b -; CHECK-NEXT: umin v0.8h, v0.8h, v16.8h -; CHECK-NEXT: mvn v16.16b, v5.16b -; CHECK-NEXT: umin v1.8h, v1.8h, v16.8h -; CHECK-NEXT: mvn v16.16b, v6.16b -; CHECK-NEXT: umin v2.8h, v2.8h, v16.8h -; CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: umin v3.8h, v3.8h, v16.8h -; CHECK-NEXT: add v0.8h, v0.8h, v4.8h -; CHECK-NEXT: add v1.8h, v1.8h, v5.8h -; CHECK-NEXT: add v2.8h, v2.8h, v6.8h -; CHECK-NEXT: add v3.8h, v3.8h, v7.8h +; CHECK-NEXT: uqadd v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uqadd v1.8h, v1.8h, v5.8h +; CHECK-NEXT: uqadd v2.8h, v2.8h, v6.8h +; CHECK-NEXT: uqadd v3.8h, v3.8h, v7.8h ; CHECK-NEXT: ret %z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y) ret <32 x i16> %z @@ -125,11 +97,9 @@ define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { ; CHECK-LABEL: v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: mvn v2.8b, v0.8b -; CHECK-NEXT: umin v1.8b, v1.8b, v2.8b -; CHECK-NEXT: add 
v0.8b, v1.8b, v0.8b +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <8 x i8>, <8 x i8>* %px @@ -146,21 +116,22 @@ ; CHECK-NEXT: ldrb w9, [x1] ; CHECK-NEXT: ldrb w10, [x0, #1] ; CHECK-NEXT: ldrb w11, [x1, #1] -; CHECK-NEXT: ldrb w12, [x0, #2] ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x1, #2] ; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrb w9, [x1, #2] ; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: ldrb w10, [x1, #3] ; CHECK-NEXT: mov v1.h[1], w11 -; CHECK-NEXT: mov v0.h[2], w12 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: mov v0.h[3], w9 -; CHECK-NEXT: mov v1.h[3], w10 -; CHECK-NEXT: movi d2, #0xff00ff00ff00ff -; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h +; CHECK-NEXT: ldrb w10, [x0, #3] +; CHECK-NEXT: ldrb w11, [x1, #3] +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: mov v1.h[2], w9 +; CHECK-NEXT: mov v0.h[3], w10 +; CHECK-NEXT: mov v1.h[3], w11 +; CHECK-NEXT: shl v1.4h, v1.4h, #8 +; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ushr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret @@ -179,12 +150,13 @@ ; CHECK-NEXT: ldrb w10, [x0, #1] ; CHECK-NEXT: ldrb w11, [x1, #1] ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v2.s[1], w11 -; CHECK-NEXT: movi d1, #0x0000ff000000ff -; CHECK-NEXT: add v0.2s, v0.2s, v2.2s -; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: mov v1.s[1], w11 +; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w8, [x2, #1] @@ -200,11 +172,9 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x 
i16>* %pz) nounwind { ; CHECK-LABEL: v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: mvn v2.8b, v0.8b -; CHECK-NEXT: umin v1.4h, v1.4h, v2.4h -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <4 x i16>, <4 x i16>* %px @@ -222,12 +192,13 @@ ; CHECK-NEXT: ldrh w10, [x0, #2] ; CHECK-NEXT: ldrh w11, [x1, #2] ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v2.s[1], w11 -; CHECK-NEXT: movi d1, #0x00ffff0000ffff -; CHECK-NEXT: add v0.2s, v0.2s, v2.2s -; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: mov v1.s[1], w11 +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [x2, #2] @@ -243,9 +214,7 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { ; CHECK-LABEL: v12i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v2.16b, v1.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <12 x i8> @llvm.uadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y) ret <12 x i8> %z @@ -254,16 +223,12 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { ; CHECK-LABEL: v12i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x1] -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: mvn v4.16b, v0.16b -; CHECK-NEXT: mvn v5.16b, v1.16b -; CHECK-NEXT: umin v2.8h, v2.8h, v4.8h -; CHECK-NEXT: umin v3.8h, v3.8h, v5.8h -; CHECK-NEXT: add v0.8h, v2.8h, v0.8h -; CHECK-NEXT: add v1.8h, v3.8h, v1.8h -; CHECK-NEXT: str q1, [x2] -; CHECK-NEXT: str d0, [x2, #16] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: 
uqadd v1.8h, v1.8h, v2.8h +; CHECK-NEXT: uqadd v0.8h, v0.8h, v3.8h +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: str d1, [x2, #16] ; CHECK-NEXT: ret %x = load <12 x i16>, <12 x i16>* %px %y = load <12 x i16>, <12 x i16>* %py @@ -275,11 +240,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { ; CHECK-LABEL: v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr b0, [x1] -; CHECK-NEXT: ldr b1, [x0] -; CHECK-NEXT: mvn v2.8b, v0.8b -; CHECK-NEXT: umin v1.8b, v1.8b, v2.8b -; CHECK-NEXT: add v0.8b, v1.8b, v0.8b +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: ldr b1, [x1] +; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret %x = load <1 x i8>, <1 x i8>* %px @@ -292,11 +255,9 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { ; CHECK-LABEL: v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x1] -; CHECK-NEXT: ldr h1, [x0] -; CHECK-NEXT: mvn v2.8b, v0.8b -; CHECK-NEXT: umin v1.4h, v1.4h, v2.4h -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ldr h1, [x1] +; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret %x = load <1 x i16>, <1 x i16>* %px @@ -310,10 +271,12 @@ ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #15 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #4 +; CHECK-NEXT: shl v0.16b, v0.16b, #4 +; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushr v0.16b, v0.16b, #4 ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -323,10 +286,12 @@ ; CHECK-LABEL: v16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #1 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b -; CHECK-NEXT: umin 
v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushr v0.16b, v0.16b, #7 ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z @@ -335,9 +300,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v2.8b, v1.8b -; CHECK-NEXT: umin v0.2s, v0.2s, v2.2s -; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %z = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z @@ -346,9 +309,7 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v2.16b, v1.16b -; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uqadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %z = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z @@ -357,12 +318,8 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; CHECK-LABEL: v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v4.16b, v2.16b -; CHECK-NEXT: mvn v5.16b, v3.16b -; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s -; CHECK-NEXT: umin v1.4s, v1.4s, v5.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: uqadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: uqadd v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %z = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z @@ -371,18 +328,10 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; CHECK-LABEL: v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v16.16b, v4.16b -; CHECK-NEXT: umin v0.4s, v0.4s, v16.4s -; CHECK-NEXT: mvn v16.16b, v5.16b -; CHECK-NEXT: umin v1.4s, v1.4s, v16.4s -; CHECK-NEXT: mvn v16.16b, v6.16b -; CHECK-NEXT: umin v2.4s, v2.4s, v16.4s -; 
CHECK-NEXT: mvn v16.16b, v7.16b -; CHECK-NEXT: umin v3.4s, v3.4s, v16.4s -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: add v2.4s, v2.4s, v6.4s -; CHECK-NEXT: add v3.4s, v3.4s, v7.4s +; CHECK-NEXT: uqadd v0.4s, v0.4s, v4.4s +; CHECK-NEXT: uqadd v1.4s, v1.4s, v5.4s +; CHECK-NEXT: uqadd v2.4s, v2.4s, v6.4s +; CHECK-NEXT: uqadd v3.4s, v3.4s, v7.4s ; CHECK-NEXT: ret %z = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y) ret <16 x i32> %z @@ -391,9 +340,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add v1.2d, v0.2d, v1.2d -; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %z = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z @@ -402,12 +349,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add v2.2d, v0.2d, v2.2d -; CHECK-NEXT: add v3.2d, v1.2d, v3.2d -; CHECK-NEXT: cmhi v0.2d, v0.2d, v2.2d -; CHECK-NEXT: cmhi v1.2d, v1.2d, v3.2d -; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b -; CHECK-NEXT: orr v1.16b, v3.16b, v1.16b +; CHECK-NEXT: uqadd v0.2d, v0.2d, v2.2d +; CHECK-NEXT: uqadd v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %z = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z @@ -416,18 +359,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add v4.2d, v0.2d, v4.2d -; CHECK-NEXT: add v5.2d, v1.2d, v5.2d -; CHECK-NEXT: add v6.2d, v2.2d, v6.2d -; CHECK-NEXT: add v7.2d, v3.2d, v7.2d -; CHECK-NEXT: cmhi v0.2d, v0.2d, v4.2d -; CHECK-NEXT: cmhi v1.2d, v1.2d, v5.2d -; CHECK-NEXT: cmhi v2.2d, v2.2d, v6.2d -; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d -; CHECK-NEXT: orr v0.16b, v4.16b, v0.16b -; CHECK-NEXT: orr v1.16b, v5.16b, v1.16b -; CHECK-NEXT: orr v2.16b, 
v6.16b, v2.16b -; CHECK-NEXT: orr v3.16b, v7.16b, v3.16b +; CHECK-NEXT: uqadd v0.2d, v0.2d, v4.2d +; CHECK-NEXT: uqadd v1.2d, v1.2d, v5.2d +; CHECK-NEXT: uqadd v2.2d, v2.2d, v6.2d +; CHECK-NEXT: uqadd v3.2d, v3.2d, v7.2d ; CHECK-NEXT: ret %z = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y) ret <8 x i64> %z diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -36,8 +36,7 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %z @@ -46,10 +45,8 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.16b, v0.16b, v2.16b -; CHECK-NEXT: umax v1.16b, v1.16b, v3.16b -; CHECK-NEXT: sub v0.16b, v0.16b, v2.16b -; CHECK-NEXT: sub v1.16b, v1.16b, v3.16b +; CHECK-NEXT: uqsub v0.16b, v0.16b, v2.16b +; CHECK-NEXT: uqsub v1.16b, v1.16b, v3.16b ; CHECK-NEXT: ret %z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y) ret <32 x i8> %z @@ -58,14 +55,10 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.16b, v0.16b, v4.16b -; CHECK-NEXT: umax v1.16b, v1.16b, v5.16b -; CHECK-NEXT: umax v2.16b, v2.16b, v6.16b -; CHECK-NEXT: umax v3.16b, v3.16b, v7.16b -; CHECK-NEXT: sub v0.16b, v0.16b, v4.16b -; CHECK-NEXT: sub v1.16b, v1.16b, v5.16b -; CHECK-NEXT: sub v2.16b, v2.16b, v6.16b -; CHECK-NEXT: sub v3.16b, v3.16b, v7.16b +; CHECK-NEXT: uqsub v0.16b, v0.16b, v4.16b +; CHECK-NEXT: uqsub v1.16b, v1.16b, v5.16b +; CHECK-NEXT: uqsub v2.16b, v2.16b, v6.16b +; CHECK-NEXT: uqsub v3.16b, v3.16b, v7.16b 
; CHECK-NEXT: ret %z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z @@ -74,8 +67,7 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; CHECK-LABEL: v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.8h, v0.8h, v1.8h -; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uqsub v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %z = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %z @@ -84,10 +76,8 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { ; CHECK-LABEL: v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.8h, v0.8h, v2.8h -; CHECK-NEXT: umax v1.8h, v1.8h, v3.8h -; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h -; CHECK-NEXT: sub v1.8h, v1.8h, v3.8h +; CHECK-NEXT: uqsub v0.8h, v0.8h, v2.8h +; CHECK-NEXT: uqsub v1.8h, v1.8h, v3.8h ; CHECK-NEXT: ret %z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y) ret <16 x i16> %z @@ -96,14 +86,10 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; CHECK-LABEL: v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.8h, v0.8h, v4.8h -; CHECK-NEXT: umax v1.8h, v1.8h, v5.8h -; CHECK-NEXT: umax v2.8h, v2.8h, v6.8h -; CHECK-NEXT: umax v3.8h, v3.8h, v7.8h -; CHECK-NEXT: sub v0.8h, v0.8h, v4.8h -; CHECK-NEXT: sub v1.8h, v1.8h, v5.8h -; CHECK-NEXT: sub v2.8h, v2.8h, v6.8h -; CHECK-NEXT: sub v3.8h, v3.8h, v7.8h +; CHECK-NEXT: uqsub v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uqsub v1.8h, v1.8h, v5.8h +; CHECK-NEXT: uqsub v2.8h, v2.8h, v6.8h +; CHECK-NEXT: uqsub v3.8h, v3.8h, v7.8h ; CHECK-NEXT: ret %z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y) ret <32 x i16> %z @@ -114,8 +100,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: umax v0.8b, v0.8b, v1.8b -; CHECK-NEXT: sub v0.8b, v0.8b, v1.8b +; CHECK-NEXT: uqsub v0.8b, v0.8b, v1.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <8 x i8>, <8 x i8>* %px @@ -144,8 +129,10 @@ ; CHECK-NEXT: mov v1.h[2], w9 ; 
CHECK-NEXT: mov v0.h[3], w10 ; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: umax v0.4h, v0.4h, v1.4h -; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: shl v1.4h, v1.4h, #8 +; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ushr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret @@ -167,8 +154,10 @@ ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: umax v0.2s, v0.2s, v1.2s -; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w8, [x2, #1] @@ -186,8 +175,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: umax v0.4h, v0.4h, v1.4h -; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <4 x i16>, <4 x i16>* %px @@ -208,8 +196,10 @@ ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: umax v0.2s, v0.2s, v1.2s -; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [x2, #2] @@ -225,8 +215,7 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { ; CHECK-LABEL: v12i8: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <12 x i8> @llvm.usub.sat.v12i8(<12 x i8> %x, <12 x i8> %y) ret <12 x i8> %z @@ -237,10 +226,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: umax 
v1.8h, v1.8h, v2.8h -; CHECK-NEXT: umax v0.8h, v0.8h, v3.8h -; CHECK-NEXT: sub v1.8h, v1.8h, v2.8h -; CHECK-NEXT: sub v0.8h, v0.8h, v3.8h +; CHECK-NEXT: uqsub v1.8h, v1.8h, v2.8h +; CHECK-NEXT: uqsub v0.8h, v0.8h, v3.8h ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: str d1, [x2, #16] ; CHECK-NEXT: ret @@ -256,8 +243,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: ldr b1, [x1] -; CHECK-NEXT: umax v0.8b, v0.8b, v1.8b -; CHECK-NEXT: sub v0.8b, v0.8b, v1.8b +; CHECK-NEXT: uqsub v0.8b, v0.8b, v1.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret %x = load <1 x i8>, <1 x i8>* %px @@ -272,8 +258,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ldr h1, [x1] -; CHECK-NEXT: umax v0.4h, v0.4h, v1.4h -; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret %x = load <1 x i16>, <1 x i16>* %px @@ -287,10 +272,12 @@ ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #15 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #4 +; CHECK-NEXT: shl v0.16b, v0.16b, #4 +; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushr v0.16b, v0.16b, #4 ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -300,10 +287,12 @@ ; CHECK-LABEL: v16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #1 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushr v0.16b, v0.16b, #7 ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x 
i1> %y) ret <16 x i1> %z @@ -312,8 +301,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.2s, v0.2s, v1.2s -; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z @@ -322,8 +310,7 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uqsub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %z = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z @@ -332,10 +319,8 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; CHECK-LABEL: v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s -; CHECK-NEXT: umax v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s +; CHECK-NEXT: uqsub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: uqsub v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z @@ -344,14 +329,10 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; CHECK-LABEL: v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.4s, v0.4s, v4.4s -; CHECK-NEXT: umax v1.4s, v1.4s, v5.4s -; CHECK-NEXT: umax v2.4s, v2.4s, v6.4s -; CHECK-NEXT: umax v3.4s, v3.4s, v7.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s +; CHECK-NEXT: uqsub v0.4s, v0.4s, v4.4s +; CHECK-NEXT: uqsub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: uqsub v2.4s, v2.4s, v6.4s +; CHECK-NEXT: uqsub v3.4s, v3.4s, v7.4s ; CHECK-NEXT: ret %z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y) ret <16 x i32> %z @@ -360,9 +341,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; 
CHECK-LABEL: v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d -; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d -; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqsub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %z = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z @@ -371,12 +350,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v2.2d, v0.2d, v2.2d -; CHECK-NEXT: sub v3.2d, v1.2d, v3.2d -; CHECK-NEXT: cmhi v0.2d, v2.2d, v0.2d -; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d -; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b -; CHECK-NEXT: bic v1.16b, v3.16b, v1.16b +; CHECK-NEXT: uqsub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: uqsub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z @@ -385,18 +360,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v4.2d, v0.2d, v4.2d -; CHECK-NEXT: sub v5.2d, v1.2d, v5.2d -; CHECK-NEXT: sub v6.2d, v2.2d, v6.2d -; CHECK-NEXT: sub v7.2d, v3.2d, v7.2d -; CHECK-NEXT: cmhi v0.2d, v4.2d, v0.2d -; CHECK-NEXT: cmhi v1.2d, v5.2d, v1.2d -; CHECK-NEXT: cmhi v2.2d, v6.2d, v2.2d -; CHECK-NEXT: cmhi v3.2d, v7.2d, v3.2d -; CHECK-NEXT: bic v0.16b, v4.16b, v0.16b -; CHECK-NEXT: bic v1.16b, v5.16b, v1.16b -; CHECK-NEXT: bic v2.16b, v6.16b, v2.16b -; CHECK-NEXT: bic v3.16b, v7.16b, v3.16b +; CHECK-NEXT: uqsub v0.2d, v0.2d, v4.2d +; CHECK-NEXT: uqsub v1.2d, v1.2d, v5.2d +; CHECK-NEXT: uqsub v2.2d, v2.2d, v6.2d +; CHECK-NEXT: uqsub v3.2d, v3.2d, v7.2d ; CHECK-NEXT: ret %z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y) ret <8 x i64> %z