Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6476,24 +6476,19 @@ SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType()); - // LHSSign -> LHS >= 0 - // RHSSign -> RHS >= 0 - // SumSign -> Result >= 0 - // - // Add: - // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) - // Sub: - // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) - SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE); - SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE); - SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign, - IsAdd ? ISD::SETEQ : ISD::SETNE); - - SDValue SumSign = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETGE); - SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); - - SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); - Overflow = DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType); + // For an addition, the result should be less than one of the operands (LHS) + // if and only if the other operand (RHS) is negative, otherwise there will + // be overflow. + // For a subtraction, the result should be less than one of the operands + // (LHS) if and only if the other operand (RHS) is (non-zero) positive, + // otherwise there will be overflow. + SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT); + SDValue ConditionRHS = + DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT); + + Overflow = DAG.getBoolExtOrTrunc( + DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl, + ResultType, ResultType); } bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result, Index: llvm/test/CodeGen/AArch64/sadd_sat.ll =================================================================== --- llvm/test/CodeGen/AArch64/sadd_sat.ll +++ llvm/test/CodeGen/AArch64/sadd_sat.ll @@ -54,17 +54,13 @@ ; CHECK-LABEL: vec: ; CHECK: // %bb.0: ; CHECK-NEXT: add v2.4s, v0.4s, v1.4s -; CHECK-NEXT: cmge v1.4s, v1.4s, #0 -; CHECK-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-NEXT: cmge v5.4s, v2.4s, #0 ; CHECK-NEXT: cmlt v4.4s, v2.4s, #0 -; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: mvni v3.4s, #128, lsl #24 +; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y); Index: llvm/test/CodeGen/AArch64/sadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -36,17 +36,13 @@ ; CHECK-LABEL: v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: add v2.16b, v0.16b, v1.16b -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v5.16b, v2.16b, #0 ; CHECK-NEXT: cmlt v4.16b, v2.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b ; CHECK-NEXT: movi v3.16b, #127 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; 
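[Editor's illustration, not part of the patch] The TargetLowering.cpp hunk above swaps the three-sign-comparison overflow check for a two-comparison form: for SADDO, overflow iff (Result < LHS) XOR (RHS < 0); for SSUBO, overflow iff (Result < LHS) XOR (RHS > 0). The standalone C++ sketch below exhaustively verifies that equivalence over i8 (wider widths follow from the same two's-complement argument). The program and its names (e.g. wrapToI8) are illustrative only and assume modular wrap-around on the narrowing conversion, which C++20 guarantees and which matches the two's-complement targets this lowering serves.

#include <cassert>
#include <cstdint>

// Wrap an int to the i8 value an ISD::ADD/ISD::SUB node would produce.
static int8_t wrapToI8(int V) { return static_cast<int8_t>(static_cast<uint8_t>(V)); }

int main() {
  for (int L = -128; L <= 127; ++L) {
    for (int R = -128; R <= 127; ++R) {
      // SADDO: old form compares the three sign bits, new form XORs two setcc results.
      int8_t Sum = wrapToI8(L + R);
      bool OldAdd = ((L >= 0) == (R >= 0)) && ((L >= 0) != (Sum >= 0));
      bool NewAdd = (Sum < L) != (R < 0);
      assert(OldAdd == NewAdd);

      // SSUBO: overflow needs differing operand signs; the RHS condition flips to "positive".
      int8_t Diff = wrapToI8(L - R);
      bool OldSub = ((L >= 0) != (R >= 0)) && ((L >= 0) != (Diff >= 0));
      bool NewSub = (Diff < L) != (R > 0);
      assert(OldSub == NewSub);
    }
  }
  return 0;
}

The payoff shows up in the AArch64 diffs that follow: per vector, the old expansion needed three cmge, two cmeq, an mvn and an and, while the new one needs only a cmlt, a cmgt and an eor.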
CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y) @@ -57,29 +53,21 @@ ; CHECK-LABEL: v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: add v4.16b, v0.16b, v2.16b -; CHECK-NEXT: cmlt v16.16b, v4.16b, #0 +; CHECK-NEXT: cmlt v7.16b, v4.16b, #0 ; CHECK-NEXT: movi v6.16b, #127 +; CHECK-NEXT: mvn v16.16b, v7.16b +; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b ; CHECK-NEXT: add v7.16b, v1.16b, v3.16b -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b +; CHECK-NEXT: cmlt v2.16b, v2.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v4.16b ; CHECK-NEXT: cmlt v16.16b, v7.16b, #0 ; CHECK-NEXT: movi v5.16b, #127 -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b -; CHECK-NEXT: cmge v2.16b, v2.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v16.16b, v4.16b, #0 -; CHECK-NEXT: cmge v3.16b, v3.16b, #0 -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmeq v2.16b, v0.16b, v2.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v16.16b -; CHECK-NEXT: cmge v16.16b, v7.16b, #0 -; CHECK-NEXT: cmeq v3.16b, v1.16b, v3.16b -; CHECK-NEXT: cmeq v1.16b, v1.16b, v16.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: cmlt v3.16b, v3.16b, #0 +; CHECK-NEXT: cmgt v1.16b, v1.16b, v7.16b +; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v16.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b ; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b ; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b ; CHECK-NEXT: ret @@ -102,42 +90,26 @@ ; CHECK-NEXT: mvn v25.16b, v24.16b ; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.16b, v21.16b, #0 +; CHECK-NEXT: cmlt v4.16b, v4.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v16.16b ; CHECK-NEXT: movi v22.16b, #127 ; CHECK-NEXT: add v23.16b, v3.16b, v7.16b ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b +; CHECK-NEXT: cmlt v4.16b, v5.16b, #0 +; CHECK-NEXT: cmgt v1.16b, v1.16b, v19.16b ; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.16b, v23.16b, #0 +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmlt v4.16b, v6.16b, #0 +; CHECK-NEXT: cmgt v2.16b, v2.16b, v21.16b ; CHECK-NEXT: movi v17.16b, #127 ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: cmlt v4.16b, v7.16b, #0 +; CHECK-NEXT: cmgt v3.16b, v3.16b, v23.16b ; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: cmge v4.16b, v4.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v24.16b, v16.16b, #0 -; CHECK-NEXT: cmge v5.16b, v5.16b, #0 -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v24.16b -; CHECK-NEXT: cmge v24.16b, v19.16b, #0 -; CHECK-NEXT: cmge v6.16b, v6.16b, #0 -; CHECK-NEXT: cmge v2.16b, v2.16b, #0 -; CHECK-NEXT: cmeq v5.16b, v1.16b, v5.16b -; CHECK-NEXT: cmeq v1.16b, v1.16b, v24.16b -; CHECK-NEXT: cmge v24.16b, v21.16b, #0 -; CHECK-NEXT: cmge v7.16b, v7.16b, #0 -; CHECK-NEXT: cmge v3.16b, v3.16b, #0 -; CHECK-NEXT: cmeq v6.16b, v2.16b, v6.16b -; CHECK-NEXT: cmeq v2.16b, v2.16b, v24.16b -; CHECK-NEXT: cmge v24.16b, v23.16b, #0 -; CHECK-NEXT: cmeq v7.16b, v3.16b, v7.16b -; CHECK-NEXT: cmeq v3.16b, v3.16b, v24.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, 
v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: and v1.16b, v5.16b, v1.16b -; CHECK-NEXT: and v2.16b, v6.16b, v2.16b -; CHECK-NEXT: and v3.16b, v7.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b ; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b ; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b @@ -151,17 +123,13 @@ ; CHECK-LABEL: v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: add v2.8h, v0.8h, v1.8h -; CHECK-NEXT: cmge v1.8h, v1.8h, #0 -; CHECK-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-NEXT: cmge v5.8h, v2.8h, #0 ; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: cmeq v1.8h, v0.8h, v1.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v5.8h ; CHECK-NEXT: mvni v3.8h, #128, lsl #8 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: cmgt v0.8h, v0.8h, v2.8h ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y) @@ -172,29 +140,21 @@ ; CHECK-LABEL: v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: add v4.8h, v0.8h, v2.8h -; CHECK-NEXT: cmlt v16.8h, v4.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: mvni v6.8h, #128, lsl #8 +; CHECK-NEXT: mvn v16.16b, v7.16b +; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b ; CHECK-NEXT: add v7.8h, v1.8h, v3.8h -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b +; CHECK-NEXT: cmlt v2.8h, v2.8h, #0 +; CHECK-NEXT: cmgt v0.8h, v0.8h, v4.8h ; CHECK-NEXT: cmlt v16.8h, v7.8h, #0 ; CHECK-NEXT: mvni v5.8h, #128, lsl #8 -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b -; CHECK-NEXT: cmge v2.8h, v2.8h, #0 -; CHECK-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-NEXT: cmge v16.8h, v4.8h, #0 -; CHECK-NEXT: cmge v3.8h, v3.8h, #0 -; CHECK-NEXT: cmge v1.8h, v1.8h, #0 -; CHECK-NEXT: cmeq v2.8h, v0.8h, v2.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h -; CHECK-NEXT: cmge v16.8h, v7.8h, #0 -; CHECK-NEXT: cmeq v3.8h, v1.8h, v3.8h -; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: cmlt v3.8h, v3.8h, #0 +; CHECK-NEXT: cmgt v1.8h, v1.8h, v7.8h +; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v16.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b ; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b ; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b ; CHECK-NEXT: ret @@ -217,42 +177,26 @@ ; CHECK-NEXT: mvn v25.16b, v24.16b ; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.8h, v21.8h, #0 +; CHECK-NEXT: cmlt v4.8h, v4.8h, #0 +; CHECK-NEXT: cmgt v0.8h, v0.8h, v16.8h ; CHECK-NEXT: mvni v22.8h, #128, lsl #8 ; CHECK-NEXT: add v23.8h, v3.8h, v7.8h ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b +; CHECK-NEXT: cmlt v4.8h, v5.8h, #0 +; CHECK-NEXT: cmgt v1.8h, v1.8h, v19.8h ; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.8h, v23.8h, #0 +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmlt v4.8h, v6.8h, #0 +; CHECK-NEXT: cmgt v2.8h, v2.8h, v21.8h ; CHECK-NEXT: mvni v17.8h, #128, lsl #8 ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: cmlt v4.8h, v7.8h, #0 +; CHECK-NEXT: cmgt v3.8h, v3.8h, v23.8h ; 
CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: cmge v4.8h, v4.8h, #0 -; CHECK-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-NEXT: cmge v24.8h, v16.8h, #0 -; CHECK-NEXT: cmge v5.8h, v5.8h, #0 -; CHECK-NEXT: cmge v1.8h, v1.8h, #0 -; CHECK-NEXT: cmeq v4.8h, v0.8h, v4.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v24.8h -; CHECK-NEXT: cmge v24.8h, v19.8h, #0 -; CHECK-NEXT: cmge v6.8h, v6.8h, #0 -; CHECK-NEXT: cmge v2.8h, v2.8h, #0 -; CHECK-NEXT: cmeq v5.8h, v1.8h, v5.8h -; CHECK-NEXT: cmeq v1.8h, v1.8h, v24.8h -; CHECK-NEXT: cmge v24.8h, v21.8h, #0 -; CHECK-NEXT: cmge v7.8h, v7.8h, #0 -; CHECK-NEXT: cmge v3.8h, v3.8h, #0 -; CHECK-NEXT: cmeq v6.8h, v2.8h, v6.8h -; CHECK-NEXT: cmeq v2.8h, v2.8h, v24.8h -; CHECK-NEXT: cmge v24.8h, v23.8h, #0 -; CHECK-NEXT: cmeq v7.8h, v3.8h, v7.8h -; CHECK-NEXT: cmeq v3.8h, v3.8h, v24.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: and v1.16b, v5.16b, v1.16b -; CHECK-NEXT: and v2.16b, v6.16b, v2.16b -; CHECK-NEXT: and v3.16b, v7.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b ; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b ; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b @@ -269,16 +213,12 @@ ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: movi v2.8b, #127 ; CHECK-NEXT: add v3.8b, v0.8b, v1.8b -; CHECK-NEXT: cmge v1.8b, v1.8b, #0 -; CHECK-NEXT: cmge v0.8b, v0.8b, #0 -; CHECK-NEXT: cmge v5.8b, v3.8b, #0 ; CHECK-NEXT: cmlt v4.8b, v3.8b, #0 -; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b -; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b +; CHECK-NEXT: cmlt v1.8b, v1.8b, #0 +; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret @@ -311,17 +251,13 @@ ; CHECK-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: add v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmge v1.4h, v1.4h, #0 -; CHECK-NEXT: cmge v0.4h, v0.4h, #0 -; CHECK-NEXT: cmge v5.4h, v3.4h, #0 ; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h -; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h ; CHECK-NEXT: mvni v2.4h, #128, lsl #8 +; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 +; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h @@ -348,17 +284,13 @@ ; CHECK-NEXT: shl v2.2s, v2.2s, #24 ; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: add v3.2s, v0.2s, v2.2s -; CHECK-NEXT: cmge v2.2s, v2.2s, #0 -; CHECK-NEXT: cmge v0.2s, v0.2s, #0 -; CHECK-NEXT: cmge v5.2s, v3.2s, #0 ; CHECK-NEXT: cmlt v4.2s, v3.2s, #0 -; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s -; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s ; CHECK-NEXT: mvni v1.2s, #128, lsl #24 +; CHECK-NEXT: cmlt v2.2s, v2.2s, #0 +; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b ; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v2.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] @@ -380,16 +312,12 @@ ; CHECK-NEXT: ldr d1, [x1] ; 
CHECK-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-NEXT: add v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmge v1.4h, v1.4h, #0 -; CHECK-NEXT: cmge v0.4h, v0.4h, #0 -; CHECK-NEXT: cmge v5.4h, v3.4h, #0 ; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h -; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h +; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 +; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret @@ -414,17 +342,13 @@ ; CHECK-NEXT: shl v2.2s, v2.2s, #16 ; CHECK-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-NEXT: add v3.2s, v0.2s, v2.2s -; CHECK-NEXT: cmge v2.2s, v2.2s, #0 -; CHECK-NEXT: cmge v0.2s, v0.2s, #0 -; CHECK-NEXT: cmge v5.2s, v3.2s, #0 ; CHECK-NEXT: cmlt v4.2s, v3.2s, #0 -; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s -; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s ; CHECK-NEXT: mvni v1.2s, #128, lsl #24 +; CHECK-NEXT: cmlt v2.2s, v2.2s, #0 +; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b ; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v2.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] @@ -443,17 +367,13 @@ ; CHECK-LABEL: v12i8: ; CHECK: // %bb.0: ; CHECK-NEXT: add v2.16b, v0.16b, v1.16b -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v5.16b, v2.16b, #0 ; CHECK-NEXT: cmlt v4.16b, v2.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b ; CHECK-NEXT: movi v3.16b, #127 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y) @@ -468,27 +388,19 @@ ; CHECK-NEXT: mvni v5.8h, #128, lsl #8 ; CHECK-NEXT: mvni v4.8h, #128, lsl #8 ; CHECK-NEXT: add v6.8h, v1.8h, v2.8h -; CHECK-NEXT: cmlt v16.8h, v6.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v6.8h, #0 +; CHECK-NEXT: mvn v16.16b, v7.16b +; CHECK-NEXT: bsl v5.16b, v7.16b, v16.16b ; CHECK-NEXT: add v7.8h, v0.8h, v3.8h -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b +; CHECK-NEXT: cmlt v2.8h, v2.8h, #0 +; CHECK-NEXT: cmgt v1.8h, v1.8h, v6.8h ; CHECK-NEXT: cmlt v16.8h, v7.8h, #0 -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v4.16b, v16.16b, v17.16b -; CHECK-NEXT: cmge v2.8h, v2.8h, #0 -; CHECK-NEXT: cmge v1.8h, v1.8h, #0 -; CHECK-NEXT: cmge v16.8h, v6.8h, #0 -; CHECK-NEXT: cmge v3.8h, v3.8h, #0 -; CHECK-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-NEXT: cmeq v2.8h, v1.8h, v2.8h -; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h -; CHECK-NEXT: cmge v16.8h, v7.8h, #0 -; CHECK-NEXT: cmeq v3.8h, v0.8h, v3.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-NEXT: cmlt v3.8h, v3.8h, #0 +; CHECK-NEXT: cmgt v0.8h, v0.8h, v7.8h +; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b +; CHECK-NEXT: mvn v2.16b, v16.16b +; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b +; CHECK-NEXT: bsl v4.16b, v16.16b, 
v2.16b ; CHECK-NEXT: bsl v1.16b, v5.16b, v6.16b ; CHECK-NEXT: bsl v0.16b, v4.16b, v7.16b ; CHECK-NEXT: str q0, [x2] @@ -508,16 +420,12 @@ ; CHECK-NEXT: ldr b1, [x1] ; CHECK-NEXT: movi v2.8b, #127 ; CHECK-NEXT: add v3.8b, v0.8b, v1.8b -; CHECK-NEXT: cmge v1.8b, v1.8b, #0 -; CHECK-NEXT: cmge v0.8b, v0.8b, #0 -; CHECK-NEXT: cmge v5.8b, v3.8b, #0 ; CHECK-NEXT: cmlt v4.8b, v3.8b, #0 -; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b -; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b +; CHECK-NEXT: cmlt v1.8b, v1.8b, #0 +; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret @@ -535,16 +443,12 @@ ; CHECK-NEXT: ldr h1, [x1] ; CHECK-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-NEXT: add v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmge v1.4h, v1.4h, #0 -; CHECK-NEXT: cmge v0.4h, v0.4h, #0 -; CHECK-NEXT: cmge v5.4h, v3.4h, #0 ; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h -; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h +; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 +; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret @@ -561,17 +465,13 @@ ; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 ; CHECK-NEXT: add v3.16b, v0.16b, v1.16b -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v5.16b, v3.16b, #0 ; CHECK-NEXT: cmlt v4.16b, v3.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b ; CHECK-NEXT: movi v2.16b, #127 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b ; CHECK-NEXT: sshr v0.16b, v0.16b, #4 ; CHECK-NEXT: ret @@ -585,17 +485,13 @@ ; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: add v3.16b, v0.16b, v1.16b -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v5.16b, v3.16b, #0 ; CHECK-NEXT: cmlt v4.16b, v3.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b ; CHECK-NEXT: movi v2.16b, #127 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b ; CHECK-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-NEXT: ret @@ -607,17 +503,13 @@ ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: add v2.2s, v0.2s, v1.2s -; CHECK-NEXT: cmge v1.2s, v1.2s, #0 -; CHECK-NEXT: cmge v0.2s, v0.2s, #0 -; CHECK-NEXT: cmge v5.2s, v2.2s, #0 ; CHECK-NEXT: cmlt v4.2s, v2.2s, #0 -; CHECK-NEXT: cmeq v1.2s, v0.2s, v1.2s -; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s ; CHECK-NEXT: mvni v3.2s, #128, lsl #24 +; CHECK-NEXT: cmlt v1.2s, v1.2s, #0 +; CHECK-NEXT: cmgt v0.2s, v0.2s, v2.2s ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: 
mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v3.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v3.8b, v2.8b ; CHECK-NEXT: ret %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) @@ -628,17 +520,13 @@ ; CHECK-LABEL: v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: add v2.4s, v0.4s, v1.4s -; CHECK-NEXT: cmge v1.4s, v1.4s, #0 -; CHECK-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-NEXT: cmge v5.4s, v2.4s, #0 ; CHECK-NEXT: cmlt v4.4s, v2.4s, #0 -; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: mvni v3.4s, #128, lsl #24 +; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) @@ -649,29 +537,21 @@ ; CHECK-LABEL: v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: add v4.4s, v0.4s, v2.4s -; CHECK-NEXT: cmlt v16.4s, v4.4s, #0 +; CHECK-NEXT: cmlt v7.4s, v4.4s, #0 ; CHECK-NEXT: mvni v6.4s, #128, lsl #24 +; CHECK-NEXT: mvn v16.16b, v7.16b +; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b ; CHECK-NEXT: add v7.4s, v1.4s, v3.4s -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b +; CHECK-NEXT: cmlt v2.4s, v2.4s, #0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v4.4s ; CHECK-NEXT: cmlt v16.4s, v7.4s, #0 ; CHECK-NEXT: mvni v5.4s, #128, lsl #24 -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b -; CHECK-NEXT: cmge v2.4s, v2.4s, #0 -; CHECK-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-NEXT: cmge v16.4s, v4.4s, #0 -; CHECK-NEXT: cmge v3.4s, v3.4s, #0 -; CHECK-NEXT: cmge v1.4s, v1.4s, #0 -; CHECK-NEXT: cmeq v2.4s, v0.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v16.4s -; CHECK-NEXT: cmge v16.4s, v7.4s, #0 -; CHECK-NEXT: cmeq v3.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v1.4s, v1.4s, v16.4s -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: cmlt v3.4s, v3.4s, #0 +; CHECK-NEXT: cmgt v1.4s, v1.4s, v7.4s +; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v16.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b ; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b ; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b ; CHECK-NEXT: ret @@ -694,42 +574,26 @@ ; CHECK-NEXT: mvn v25.16b, v24.16b ; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.4s, v21.4s, #0 +; CHECK-NEXT: cmlt v4.4s, v4.4s, #0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v16.4s ; CHECK-NEXT: mvni v22.4s, #128, lsl #24 ; CHECK-NEXT: add v23.4s, v3.4s, v7.4s ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b +; CHECK-NEXT: cmlt v4.4s, v5.4s, #0 +; CHECK-NEXT: cmgt v1.4s, v1.4s, v19.4s ; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.4s, v23.4s, #0 +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmlt v4.4s, v6.4s, #0 +; CHECK-NEXT: cmgt v2.4s, v2.4s, v21.4s ; CHECK-NEXT: mvni v17.4s, #128, lsl #24 ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: cmlt v4.4s, v7.4s, #0 +; CHECK-NEXT: cmgt v3.4s, v3.4s, v23.4s ; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: cmge v4.4s, v4.4s, #0 -; CHECK-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-NEXT: cmge v24.4s, v16.4s, #0 
-; CHECK-NEXT: cmge v5.4s, v5.4s, #0 -; CHECK-NEXT: cmge v1.4s, v1.4s, #0 -; CHECK-NEXT: cmeq v4.4s, v0.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v24.4s -; CHECK-NEXT: cmge v24.4s, v19.4s, #0 -; CHECK-NEXT: cmge v6.4s, v6.4s, #0 -; CHECK-NEXT: cmge v2.4s, v2.4s, #0 -; CHECK-NEXT: cmeq v5.4s, v1.4s, v5.4s -; CHECK-NEXT: cmeq v1.4s, v1.4s, v24.4s -; CHECK-NEXT: cmge v24.4s, v21.4s, #0 -; CHECK-NEXT: cmge v7.4s, v7.4s, #0 -; CHECK-NEXT: cmge v3.4s, v3.4s, #0 -; CHECK-NEXT: cmeq v6.4s, v2.4s, v6.4s -; CHECK-NEXT: cmeq v2.4s, v2.4s, v24.4s -; CHECK-NEXT: cmge v24.4s, v23.4s, #0 -; CHECK-NEXT: cmeq v7.4s, v3.4s, v7.4s -; CHECK-NEXT: cmeq v3.4s, v3.4s, v24.4s -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: and v1.16b, v5.16b, v1.16b -; CHECK-NEXT: and v2.16b, v6.16b, v2.16b -; CHECK-NEXT: and v3.16b, v7.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b ; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b ; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b @@ -743,18 +607,14 @@ ; CHECK-LABEL: v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: add v2.2d, v0.2d, v1.2d -; CHECK-NEXT: cmge v1.2d, v1.2d, #0 -; CHECK-NEXT: cmge v0.2d, v0.2d, #0 -; CHECK-NEXT: cmge v5.2d, v2.2d, #0 ; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: cmlt v3.2d, v2.2d, #0 -; CHECK-NEXT: cmeq v1.2d, v0.2d, v1.2d -; CHECK-NEXT: cmeq v0.2d, v0.2d, v5.2d +; CHECK-NEXT: cmlt v1.2d, v1.2d, #0 ; CHECK-NEXT: dup v4.2d, x8 +; CHECK-NEXT: cmgt v0.2d, v0.2d, v2.2d ; CHECK-NEXT: mvn v5.16b, v3.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v4.16b, v3.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b ; CHECK-NEXT: ret %z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y) @@ -766,31 +626,23 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: add v4.2d, v0.2d, v2.2d ; CHECK-NEXT: mov x8, #9223372036854775807 -; CHECK-NEXT: cmlt v6.2d, v4.2d, #0 -; CHECK-NEXT: dup v7.2d, x8 +; CHECK-NEXT: cmlt v5.2d, v4.2d, #0 +; CHECK-NEXT: dup v6.2d, x8 +; CHECK-NEXT: mvn v7.16b, v5.16b +; CHECK-NEXT: mov v16.16b, v6.16b +; CHECK-NEXT: bsl v16.16b, v5.16b, v7.16b ; CHECK-NEXT: add v5.2d, v1.2d, v3.2d -; CHECK-NEXT: mvn v16.16b, v6.16b -; CHECK-NEXT: mov v17.16b, v7.16b -; CHECK-NEXT: bsl v17.16b, v6.16b, v16.16b -; CHECK-NEXT: cmlt v6.2d, v5.2d, #0 -; CHECK-NEXT: mvn v16.16b, v6.16b -; CHECK-NEXT: bsl v7.16b, v6.16b, v16.16b -; CHECK-NEXT: cmge v2.2d, v2.2d, #0 -; CHECK-NEXT: cmge v0.2d, v0.2d, #0 -; CHECK-NEXT: cmge v6.2d, v4.2d, #0 -; CHECK-NEXT: cmge v3.2d, v3.2d, #0 -; CHECK-NEXT: cmge v1.2d, v1.2d, #0 -; CHECK-NEXT: cmeq v2.2d, v0.2d, v2.2d -; CHECK-NEXT: cmeq v0.2d, v0.2d, v6.2d -; CHECK-NEXT: cmge v6.2d, v5.2d, #0 -; CHECK-NEXT: cmeq v3.2d, v1.2d, v3.2d -; CHECK-NEXT: cmeq v1.2d, v1.2d, v6.2d -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v0.16b, v17.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v7.16b, v5.16b +; CHECK-NEXT: cmlt v2.2d, v2.2d, #0 +; CHECK-NEXT: cmgt v0.2d, v0.2d, v4.2d +; CHECK-NEXT: cmlt v7.2d, v5.2d, #0 +; CHECK-NEXT: cmlt v3.2d, v3.2d, #0 +; CHECK-NEXT: cmgt v1.2d, v1.2d, v5.2d +; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: bsl v6.16b, v7.16b, v2.16b +; CHECK-NEXT: 
bsl v0.16b, v16.16b, v4.16b +; CHECK-NEXT: bsl v1.16b, v6.16b, v5.16b ; CHECK-NEXT: ret %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z @@ -812,42 +664,26 @@ ; CHECK-NEXT: bsl v25.16b, v20.16b, v24.16b ; CHECK-NEXT: mvn v20.16b, v22.16b ; CHECK-NEXT: mov v24.16b, v21.16b +; CHECK-NEXT: cmlt v4.2d, v4.2d, #0 +; CHECK-NEXT: cmgt v0.2d, v0.2d, v16.2d ; CHECK-NEXT: add v19.2d, v3.2d, v7.2d ; CHECK-NEXT: bsl v24.16b, v22.16b, v20.16b ; CHECK-NEXT: mvn v20.16b, v23.16b ; CHECK-NEXT: mov v22.16b, v21.16b +; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b +; CHECK-NEXT: cmlt v4.2d, v5.2d, #0 +; CHECK-NEXT: cmgt v1.2d, v1.2d, v17.2d ; CHECK-NEXT: bsl v22.16b, v23.16b, v20.16b ; CHECK-NEXT: cmlt v20.2d, v19.2d, #0 +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmlt v4.2d, v6.2d, #0 +; CHECK-NEXT: cmgt v2.2d, v2.2d, v18.2d ; CHECK-NEXT: mvn v23.16b, v20.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: cmlt v4.2d, v7.2d, #0 +; CHECK-NEXT: cmgt v3.2d, v3.2d, v19.2d ; CHECK-NEXT: bsl v21.16b, v20.16b, v23.16b -; CHECK-NEXT: cmge v4.2d, v4.2d, #0 -; CHECK-NEXT: cmge v0.2d, v0.2d, #0 -; CHECK-NEXT: cmge v20.2d, v16.2d, #0 -; CHECK-NEXT: cmge v5.2d, v5.2d, #0 -; CHECK-NEXT: cmge v1.2d, v1.2d, #0 -; CHECK-NEXT: cmeq v4.2d, v0.2d, v4.2d -; CHECK-NEXT: cmeq v0.2d, v0.2d, v20.2d -; CHECK-NEXT: cmge v20.2d, v17.2d, #0 -; CHECK-NEXT: cmge v6.2d, v6.2d, #0 -; CHECK-NEXT: cmge v2.2d, v2.2d, #0 -; CHECK-NEXT: cmeq v5.2d, v1.2d, v5.2d -; CHECK-NEXT: cmeq v1.2d, v1.2d, v20.2d -; CHECK-NEXT: cmge v20.2d, v18.2d, #0 -; CHECK-NEXT: cmge v7.2d, v7.2d, #0 -; CHECK-NEXT: cmge v3.2d, v3.2d, #0 -; CHECK-NEXT: cmeq v6.2d, v2.2d, v6.2d -; CHECK-NEXT: cmeq v2.2d, v2.2d, v20.2d -; CHECK-NEXT: cmge v20.2d, v19.2d, #0 -; CHECK-NEXT: cmeq v7.2d, v3.2d, v7.2d -; CHECK-NEXT: cmeq v3.2d, v3.2d, v20.2d -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: and v1.16b, v5.16b, v1.16b -; CHECK-NEXT: and v2.16b, v6.16b, v2.16b -; CHECK-NEXT: and v3.16b, v7.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v0.16b, v25.16b, v16.16b ; CHECK-NEXT: bsl v1.16b, v24.16b, v17.16b ; CHECK-NEXT: bsl v2.16b, v22.16b, v18.16b Index: llvm/test/CodeGen/AArch64/ssub_sat.ll =================================================================== --- llvm/test/CodeGen/AArch64/ssub_sat.ll +++ llvm/test/CodeGen/AArch64/ssub_sat.ll @@ -54,18 +54,13 @@ ; CHECK-LABEL: vec: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s -; CHECK-NEXT: cmge v1.4s, v1.4s, #0 -; CHECK-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-NEXT: cmge v5.4s, v2.4s, #0 ; CHECK-NEXT: cmlt v4.4s, v2.4s, #0 -; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: mvni v3.4s, #128, lsl #24 +; CHECK-NEXT: cmgt v1.4s, v1.4s, #0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y); Index: llvm/test/CodeGen/AArch64/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -37,18 +37,13 @@ ; CHECK-LABEL: 
v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v2.16b, v0.16b, v1.16b -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v5.16b, v2.16b, #0 ; CHECK-NEXT: cmlt v4.16b, v2.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b ; CHECK-NEXT: movi v3.16b, #127 +; CHECK-NEXT: cmgt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y) @@ -59,31 +54,21 @@ ; CHECK-LABEL: v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v4.16b, v0.16b, v2.16b -; CHECK-NEXT: cmlt v16.16b, v4.16b, #0 +; CHECK-NEXT: cmlt v7.16b, v4.16b, #0 ; CHECK-NEXT: movi v6.16b, #127 +; CHECK-NEXT: mvn v16.16b, v7.16b +; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b ; CHECK-NEXT: sub v7.16b, v1.16b, v3.16b -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b +; CHECK-NEXT: cmgt v2.16b, v2.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v4.16b ; CHECK-NEXT: cmlt v16.16b, v7.16b, #0 ; CHECK-NEXT: movi v5.16b, #127 -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b -; CHECK-NEXT: cmge v2.16b, v2.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v16.16b, v4.16b, #0 -; CHECK-NEXT: cmge v3.16b, v3.16b, #0 -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmeq v2.16b, v0.16b, v2.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v16.16b -; CHECK-NEXT: cmge v16.16b, v7.16b, #0 -; CHECK-NEXT: cmeq v3.16b, v1.16b, v3.16b -; CHECK-NEXT: cmeq v1.16b, v1.16b, v16.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: cmgt v3.16b, v3.16b, #0 +; CHECK-NEXT: cmgt v1.16b, v1.16b, v7.16b +; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v16.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b ; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b ; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b ; CHECK-NEXT: ret @@ -106,46 +91,26 @@ ; CHECK-NEXT: mvn v25.16b, v24.16b ; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.16b, v21.16b, #0 +; CHECK-NEXT: cmgt v4.16b, v4.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v16.16b ; CHECK-NEXT: movi v22.16b, #127 ; CHECK-NEXT: sub v23.16b, v3.16b, v7.16b ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b +; CHECK-NEXT: cmgt v4.16b, v5.16b, #0 +; CHECK-NEXT: cmgt v1.16b, v1.16b, v19.16b ; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.16b, v23.16b, #0 +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmgt v4.16b, v6.16b, #0 +; CHECK-NEXT: cmgt v2.16b, v2.16b, v21.16b ; CHECK-NEXT: movi v17.16b, #127 ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: cmgt v4.16b, v7.16b, #0 +; CHECK-NEXT: cmgt v3.16b, v3.16b, v23.16b ; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: cmge v4.16b, v4.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v24.16b, v16.16b, #0 -; CHECK-NEXT: cmge v5.16b, v5.16b, #0 -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b -; CHECK-NEXT: 
cmeq v0.16b, v0.16b, v24.16b -; CHECK-NEXT: cmge v24.16b, v19.16b, #0 -; CHECK-NEXT: cmge v6.16b, v6.16b, #0 -; CHECK-NEXT: cmge v2.16b, v2.16b, #0 -; CHECK-NEXT: cmeq v5.16b, v1.16b, v5.16b -; CHECK-NEXT: cmeq v1.16b, v1.16b, v24.16b -; CHECK-NEXT: cmge v24.16b, v21.16b, #0 -; CHECK-NEXT: mvn v4.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: cmge v7.16b, v7.16b, #0 -; CHECK-NEXT: cmge v3.16b, v3.16b, #0 -; CHECK-NEXT: cmeq v6.16b, v2.16b, v6.16b -; CHECK-NEXT: cmeq v2.16b, v2.16b, v24.16b -; CHECK-NEXT: cmge v24.16b, v23.16b, #0 -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: mvn v4.16b, v5.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: cmeq v7.16b, v3.16b, v7.16b -; CHECK-NEXT: cmeq v3.16b, v3.16b, v24.16b -; CHECK-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-NEXT: mvn v4.16b, v6.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: and v2.16b, v4.16b, v2.16b -; CHECK-NEXT: mvn v4.16b, v7.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: and v3.16b, v4.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b ; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b ; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b @@ -159,18 +124,13 @@ ; CHECK-LABEL: v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v2.8h, v0.8h, v1.8h -; CHECK-NEXT: cmge v1.8h, v1.8h, #0 -; CHECK-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-NEXT: cmge v5.8h, v2.8h, #0 ; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: cmeq v1.8h, v0.8h, v1.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v5.8h ; CHECK-NEXT: mvni v3.8h, #128, lsl #8 +; CHECK-NEXT: cmgt v1.8h, v1.8h, #0 +; CHECK-NEXT: cmgt v0.8h, v0.8h, v2.8h ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y) @@ -181,31 +141,21 @@ ; CHECK-LABEL: v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v4.8h, v0.8h, v2.8h -; CHECK-NEXT: cmlt v16.8h, v4.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: mvni v6.8h, #128, lsl #8 +; CHECK-NEXT: mvn v16.16b, v7.16b +; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b ; CHECK-NEXT: sub v7.8h, v1.8h, v3.8h -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b +; CHECK-NEXT: cmgt v2.8h, v2.8h, #0 +; CHECK-NEXT: cmgt v0.8h, v0.8h, v4.8h ; CHECK-NEXT: cmlt v16.8h, v7.8h, #0 ; CHECK-NEXT: mvni v5.8h, #128, lsl #8 -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b -; CHECK-NEXT: cmge v2.8h, v2.8h, #0 -; CHECK-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-NEXT: cmge v16.8h, v4.8h, #0 -; CHECK-NEXT: cmge v3.8h, v3.8h, #0 -; CHECK-NEXT: cmge v1.8h, v1.8h, #0 -; CHECK-NEXT: cmeq v2.8h, v0.8h, v2.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h -; CHECK-NEXT: cmge v16.8h, v7.8h, #0 -; CHECK-NEXT: cmeq v3.8h, v1.8h, v3.8h -; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: cmgt v3.8h, v3.8h, #0 +; CHECK-NEXT: cmgt v1.8h, v1.8h, v7.8h +; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v16.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b ; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b ; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b ; 
CHECK-NEXT: ret @@ -228,46 +178,26 @@ ; CHECK-NEXT: mvn v25.16b, v24.16b ; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.8h, v21.8h, #0 +; CHECK-NEXT: cmgt v4.8h, v4.8h, #0 +; CHECK-NEXT: cmgt v0.8h, v0.8h, v16.8h ; CHECK-NEXT: mvni v22.8h, #128, lsl #8 ; CHECK-NEXT: sub v23.8h, v3.8h, v7.8h ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b +; CHECK-NEXT: cmgt v4.8h, v5.8h, #0 +; CHECK-NEXT: cmgt v1.8h, v1.8h, v19.8h ; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.8h, v23.8h, #0 +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmgt v4.8h, v6.8h, #0 +; CHECK-NEXT: cmgt v2.8h, v2.8h, v21.8h ; CHECK-NEXT: mvni v17.8h, #128, lsl #8 ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: cmgt v4.8h, v7.8h, #0 +; CHECK-NEXT: cmgt v3.8h, v3.8h, v23.8h ; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: cmge v4.8h, v4.8h, #0 -; CHECK-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-NEXT: cmge v24.8h, v16.8h, #0 -; CHECK-NEXT: cmge v5.8h, v5.8h, #0 -; CHECK-NEXT: cmge v1.8h, v1.8h, #0 -; CHECK-NEXT: cmeq v4.8h, v0.8h, v4.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v24.8h -; CHECK-NEXT: cmge v24.8h, v19.8h, #0 -; CHECK-NEXT: cmge v6.8h, v6.8h, #0 -; CHECK-NEXT: cmge v2.8h, v2.8h, #0 -; CHECK-NEXT: cmeq v5.8h, v1.8h, v5.8h -; CHECK-NEXT: cmeq v1.8h, v1.8h, v24.8h -; CHECK-NEXT: cmge v24.8h, v21.8h, #0 -; CHECK-NEXT: mvn v4.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: cmge v7.8h, v7.8h, #0 -; CHECK-NEXT: cmge v3.8h, v3.8h, #0 -; CHECK-NEXT: cmeq v6.8h, v2.8h, v6.8h -; CHECK-NEXT: cmeq v2.8h, v2.8h, v24.8h -; CHECK-NEXT: cmge v24.8h, v23.8h, #0 -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: mvn v4.16b, v5.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: cmeq v7.8h, v3.8h, v7.8h -; CHECK-NEXT: cmeq v3.8h, v3.8h, v24.8h -; CHECK-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-NEXT: mvn v4.16b, v6.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: and v2.16b, v4.16b, v2.16b -; CHECK-NEXT: mvn v4.16b, v7.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: and v3.16b, v4.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b ; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b ; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b @@ -284,17 +214,12 @@ ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: movi v2.8b, #127 ; CHECK-NEXT: sub v3.8b, v0.8b, v1.8b -; CHECK-NEXT: cmge v1.8b, v1.8b, #0 -; CHECK-NEXT: cmge v0.8b, v0.8b, #0 -; CHECK-NEXT: cmge v5.8b, v3.8b, #0 ; CHECK-NEXT: cmlt v4.8b, v3.8b, #0 -; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b -; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b +; CHECK-NEXT: cmgt v1.8b, v1.8b, #0 +; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v1.8b, v1.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret @@ -327,18 +252,13 @@ ; CHECK-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmge v1.4h, v1.4h, #0 -; CHECK-NEXT: cmge v0.4h, v0.4h, #0 -; CHECK-NEXT: cmge v5.4h, v3.4h, #0 ; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h -; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h ; CHECK-NEXT: mvni v2.4h, #128, lsl #8 +; CHECK-NEXT: cmgt v1.4h, v1.4h, #0 +; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v1.8b, v1.8b -; 
CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h @@ -365,18 +285,13 @@ ; CHECK-NEXT: shl v2.2s, v2.2s, #24 ; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: sub v3.2s, v0.2s, v2.2s -; CHECK-NEXT: cmge v2.2s, v2.2s, #0 -; CHECK-NEXT: cmge v0.2s, v0.2s, #0 -; CHECK-NEXT: cmge v5.2s, v3.2s, #0 ; CHECK-NEXT: cmlt v4.2s, v3.2s, #0 -; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s -; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s ; CHECK-NEXT: mvni v1.2s, #128, lsl #24 +; CHECK-NEXT: cmgt v2.2s, v2.2s, #0 +; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v2.8b, v2.8b -; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b ; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v2.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] @@ -398,17 +313,12 @@ ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmge v1.4h, v1.4h, #0 -; CHECK-NEXT: cmge v0.4h, v0.4h, #0 -; CHECK-NEXT: cmge v5.4h, v3.4h, #0 ; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h -; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h +; CHECK-NEXT: cmgt v1.4h, v1.4h, #0 +; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v1.8b, v1.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret @@ -433,18 +343,13 @@ ; CHECK-NEXT: shl v2.2s, v2.2s, #16 ; CHECK-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-NEXT: sub v3.2s, v0.2s, v2.2s -; CHECK-NEXT: cmge v2.2s, v2.2s, #0 -; CHECK-NEXT: cmge v0.2s, v0.2s, #0 -; CHECK-NEXT: cmge v5.2s, v3.2s, #0 ; CHECK-NEXT: cmlt v4.2s, v3.2s, #0 -; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s -; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s ; CHECK-NEXT: mvni v1.2s, #128, lsl #24 +; CHECK-NEXT: cmgt v2.2s, v2.2s, #0 +; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v2.8b, v2.8b -; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b ; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v2.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] @@ -463,18 +368,13 @@ ; CHECK-LABEL: v12i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v2.16b, v0.16b, v1.16b -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v5.16b, v2.16b, #0 ; CHECK-NEXT: cmlt v4.16b, v2.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b ; CHECK-NEXT: movi v3.16b, #127 +; CHECK-NEXT: cmgt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y) @@ -489,29 +389,19 @@ ; CHECK-NEXT: mvni v5.8h, #128, lsl #8 ; CHECK-NEXT: mvni v4.8h, #128, lsl #8 ; CHECK-NEXT: sub v6.8h, v1.8h, v2.8h -; CHECK-NEXT: cmlt v16.8h, v6.8h, #0 +; 
CHECK-NEXT: cmlt v7.8h, v6.8h, #0 +; CHECK-NEXT: mvn v16.16b, v7.16b +; CHECK-NEXT: bsl v5.16b, v7.16b, v16.16b ; CHECK-NEXT: sub v7.8h, v0.8h, v3.8h -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b +; CHECK-NEXT: cmgt v2.8h, v2.8h, #0 +; CHECK-NEXT: cmgt v1.8h, v1.8h, v6.8h ; CHECK-NEXT: cmlt v16.8h, v7.8h, #0 -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v4.16b, v16.16b, v17.16b -; CHECK-NEXT: cmge v2.8h, v2.8h, #0 -; CHECK-NEXT: cmge v1.8h, v1.8h, #0 -; CHECK-NEXT: cmge v16.8h, v6.8h, #0 -; CHECK-NEXT: cmge v3.8h, v3.8h, #0 -; CHECK-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-NEXT: cmeq v2.8h, v1.8h, v2.8h -; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h -; CHECK-NEXT: cmge v16.8h, v7.8h, #0 -; CHECK-NEXT: cmeq v3.8h, v0.8h, v3.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-NEXT: cmgt v3.8h, v3.8h, #0 +; CHECK-NEXT: cmgt v0.8h, v0.8h, v7.8h +; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b +; CHECK-NEXT: mvn v2.16b, v16.16b +; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b +; CHECK-NEXT: bsl v4.16b, v16.16b, v2.16b ; CHECK-NEXT: bsl v1.16b, v5.16b, v6.16b ; CHECK-NEXT: bsl v0.16b, v4.16b, v7.16b ; CHECK-NEXT: str q0, [x2] @@ -531,17 +421,12 @@ ; CHECK-NEXT: ldr b1, [x1] ; CHECK-NEXT: movi v2.8b, #127 ; CHECK-NEXT: sub v3.8b, v0.8b, v1.8b -; CHECK-NEXT: cmge v1.8b, v1.8b, #0 -; CHECK-NEXT: cmge v0.8b, v0.8b, #0 -; CHECK-NEXT: cmge v5.8b, v3.8b, #0 ; CHECK-NEXT: cmlt v4.8b, v3.8b, #0 -; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b -; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b +; CHECK-NEXT: cmgt v1.8b, v1.8b, #0 +; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v1.8b, v1.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret @@ -559,17 +444,12 @@ ; CHECK-NEXT: ldr h1, [x1] ; CHECK-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h -; CHECK-NEXT: cmge v1.4h, v1.4h, #0 -; CHECK-NEXT: cmge v0.4h, v0.4h, #0 -; CHECK-NEXT: cmge v5.4h, v3.4h, #0 ; CHECK-NEXT: cmlt v4.4h, v3.4h, #0 -; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h -; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h +; CHECK-NEXT: cmgt v1.4h, v1.4h, #0 +; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v1.8b, v1.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret @@ -586,18 +466,13 @@ ; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 ; CHECK-NEXT: sub v3.16b, v0.16b, v1.16b -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v5.16b, v3.16b, #0 ; CHECK-NEXT: cmlt v4.16b, v3.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b ; CHECK-NEXT: movi v2.16b, #127 +; CHECK-NEXT: cmgt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; 
CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b ; CHECK-NEXT: sshr v0.16b, v0.16b, #4 ; CHECK-NEXT: ret @@ -611,18 +486,13 @@ ; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: sub v3.16b, v0.16b, v1.16b -; CHECK-NEXT: cmge v1.16b, v1.16b, #0 -; CHECK-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-NEXT: cmge v5.16b, v3.16b, #0 ; CHECK-NEXT: cmlt v4.16b, v3.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b ; CHECK-NEXT: movi v2.16b, #127 +; CHECK-NEXT: cmgt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b ; CHECK-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-NEXT: ret @@ -634,18 +504,13 @@ ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v2.2s, v0.2s, v1.2s -; CHECK-NEXT: cmge v1.2s, v1.2s, #0 -; CHECK-NEXT: cmge v0.2s, v0.2s, #0 -; CHECK-NEXT: cmge v5.2s, v2.2s, #0 ; CHECK-NEXT: cmlt v4.2s, v2.2s, #0 -; CHECK-NEXT: cmeq v1.2s, v0.2s, v1.2s -; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s ; CHECK-NEXT: mvni v3.2s, #128, lsl #24 +; CHECK-NEXT: cmgt v1.2s, v1.2s, #0 +; CHECK-NEXT: cmgt v0.2s, v0.2s, v2.2s ; CHECK-NEXT: mvn v5.8b, v4.8b -; CHECK-NEXT: mvn v1.8b, v1.8b -; CHECK-NEXT: mvn v0.8b, v0.8b ; CHECK-NEXT: bsl v3.8b, v4.8b, v5.8b -; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b ; CHECK-NEXT: bsl v0.8b, v3.8b, v2.8b ; CHECK-NEXT: ret %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) @@ -656,18 +521,13 @@ ; CHECK-LABEL: v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s -; CHECK-NEXT: cmge v1.4s, v1.4s, #0 -; CHECK-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-NEXT: cmge v5.4s, v2.4s, #0 ; CHECK-NEXT: cmlt v4.4s, v2.4s, #0 -; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: mvni v3.4s, #128, lsl #24 +; CHECK-NEXT: cmgt v1.4s, v1.4s, #0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s ; CHECK-NEXT: mvn v5.16b, v4.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) @@ -678,31 +538,21 @@ ; CHECK-LABEL: v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v4.4s, v0.4s, v2.4s -; CHECK-NEXT: cmlt v16.4s, v4.4s, #0 +; CHECK-NEXT: cmlt v7.4s, v4.4s, #0 ; CHECK-NEXT: mvni v6.4s, #128, lsl #24 +; CHECK-NEXT: mvn v16.16b, v7.16b +; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b ; CHECK-NEXT: sub v7.4s, v1.4s, v3.4s -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b +; CHECK-NEXT: cmgt v2.4s, v2.4s, #0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v4.4s ; CHECK-NEXT: cmlt v16.4s, v7.4s, #0 ; CHECK-NEXT: mvni v5.4s, #128, lsl #24 -; CHECK-NEXT: mvn v17.16b, v16.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b -; CHECK-NEXT: cmge v2.4s, v2.4s, #0 -; CHECK-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-NEXT: cmge v16.4s, v4.4s, #0 -; CHECK-NEXT: cmge v3.4s, v3.4s, #0 -; CHECK-NEXT: cmge v1.4s, v1.4s, #0 -; CHECK-NEXT: cmeq v2.4s, v0.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v16.4s -; CHECK-NEXT: cmge v16.4s, v7.4s, #0 -; CHECK-NEXT: cmeq v3.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v1.4s, v1.4s, v16.4s -; CHECK-NEXT: mvn v2.16b, 
v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: cmgt v3.4s, v3.4s, #0 +; CHECK-NEXT: cmgt v1.4s, v1.4s, v7.4s +; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v16.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b ; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b ; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b ; CHECK-NEXT: ret @@ -725,46 +575,26 @@ ; CHECK-NEXT: mvn v25.16b, v24.16b ; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.4s, v21.4s, #0 +; CHECK-NEXT: cmgt v4.4s, v4.4s, #0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v16.4s ; CHECK-NEXT: mvni v22.4s, #128, lsl #24 ; CHECK-NEXT: sub v23.4s, v3.4s, v7.4s ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b +; CHECK-NEXT: cmgt v4.4s, v5.4s, #0 +; CHECK-NEXT: cmgt v1.4s, v1.4s, v19.4s ; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b ; CHECK-NEXT: cmlt v24.4s, v23.4s, #0 +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmgt v4.4s, v6.4s, #0 +; CHECK-NEXT: cmgt v2.4s, v2.4s, v21.4s ; CHECK-NEXT: mvni v17.4s, #128, lsl #24 ; CHECK-NEXT: mvn v25.16b, v24.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: cmgt v4.4s, v7.4s, #0 +; CHECK-NEXT: cmgt v3.4s, v3.4s, v23.4s ; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b -; CHECK-NEXT: cmge v4.4s, v4.4s, #0 -; CHECK-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-NEXT: cmge v24.4s, v16.4s, #0 -; CHECK-NEXT: cmge v5.4s, v5.4s, #0 -; CHECK-NEXT: cmge v1.4s, v1.4s, #0 -; CHECK-NEXT: cmeq v4.4s, v0.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v24.4s -; CHECK-NEXT: cmge v24.4s, v19.4s, #0 -; CHECK-NEXT: cmge v6.4s, v6.4s, #0 -; CHECK-NEXT: cmge v2.4s, v2.4s, #0 -; CHECK-NEXT: cmeq v5.4s, v1.4s, v5.4s -; CHECK-NEXT: cmeq v1.4s, v1.4s, v24.4s -; CHECK-NEXT: cmge v24.4s, v21.4s, #0 -; CHECK-NEXT: mvn v4.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: cmge v7.4s, v7.4s, #0 -; CHECK-NEXT: cmge v3.4s, v3.4s, #0 -; CHECK-NEXT: cmeq v6.4s, v2.4s, v6.4s -; CHECK-NEXT: cmeq v2.4s, v2.4s, v24.4s -; CHECK-NEXT: cmge v24.4s, v23.4s, #0 -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: mvn v4.16b, v5.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: cmeq v7.4s, v3.4s, v7.4s -; CHECK-NEXT: cmeq v3.4s, v3.4s, v24.4s -; CHECK-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-NEXT: mvn v4.16b, v6.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: and v2.16b, v4.16b, v2.16b -; CHECK-NEXT: mvn v4.16b, v7.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: and v3.16b, v4.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b ; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b ; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b @@ -778,19 +608,14 @@ ; CHECK-LABEL: v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v2.2d, v0.2d, v1.2d -; CHECK-NEXT: cmge v1.2d, v1.2d, #0 -; CHECK-NEXT: cmge v0.2d, v0.2d, #0 -; CHECK-NEXT: cmge v5.2d, v2.2d, #0 ; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: cmlt v3.2d, v2.2d, #0 -; CHECK-NEXT: cmeq v1.2d, v0.2d, v1.2d -; CHECK-NEXT: cmeq v0.2d, v0.2d, v5.2d +; CHECK-NEXT: cmgt v1.2d, v1.2d, #0 ; CHECK-NEXT: dup v4.2d, x8 +; CHECK-NEXT: cmgt v0.2d, v0.2d, v2.2d ; CHECK-NEXT: mvn v5.16b, v3.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: bsl v4.16b, v3.16b, v5.16b -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b ; CHECK-NEXT: bsl v0.16b, v4.16b, 
v2.16b ; CHECK-NEXT: ret %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y) @@ -802,33 +627,23 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: sub v4.2d, v0.2d, v2.2d ; CHECK-NEXT: mov x8, #9223372036854775807 -; CHECK-NEXT: cmlt v6.2d, v4.2d, #0 -; CHECK-NEXT: dup v7.2d, x8 +; CHECK-NEXT: cmlt v5.2d, v4.2d, #0 +; CHECK-NEXT: dup v6.2d, x8 +; CHECK-NEXT: mvn v7.16b, v5.16b +; CHECK-NEXT: mov v16.16b, v6.16b +; CHECK-NEXT: bsl v16.16b, v5.16b, v7.16b ; CHECK-NEXT: sub v5.2d, v1.2d, v3.2d -; CHECK-NEXT: mvn v16.16b, v6.16b -; CHECK-NEXT: mov v17.16b, v7.16b -; CHECK-NEXT: bsl v17.16b, v6.16b, v16.16b -; CHECK-NEXT: cmlt v6.2d, v5.2d, #0 -; CHECK-NEXT: mvn v16.16b, v6.16b -; CHECK-NEXT: bsl v7.16b, v6.16b, v16.16b -; CHECK-NEXT: cmge v2.2d, v2.2d, #0 -; CHECK-NEXT: cmge v0.2d, v0.2d, #0 -; CHECK-NEXT: cmge v6.2d, v4.2d, #0 -; CHECK-NEXT: cmge v3.2d, v3.2d, #0 -; CHECK-NEXT: cmge v1.2d, v1.2d, #0 -; CHECK-NEXT: cmeq v2.2d, v0.2d, v2.2d -; CHECK-NEXT: cmeq v0.2d, v0.2d, v6.2d -; CHECK-NEXT: cmge v6.2d, v5.2d, #0 -; CHECK-NEXT: cmeq v3.2d, v1.2d, v3.2d -; CHECK-NEXT: cmeq v1.2d, v1.2d, v6.2d -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-NEXT: bsl v0.16b, v17.16b, v4.16b -; CHECK-NEXT: bsl v1.16b, v7.16b, v5.16b +; CHECK-NEXT: cmgt v2.2d, v2.2d, #0 +; CHECK-NEXT: cmgt v0.2d, v0.2d, v4.2d +; CHECK-NEXT: cmlt v7.2d, v5.2d, #0 +; CHECK-NEXT: cmgt v3.2d, v3.2d, #0 +; CHECK-NEXT: cmgt v1.2d, v1.2d, v5.2d +; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: bsl v6.16b, v7.16b, v2.16b +; CHECK-NEXT: bsl v0.16b, v16.16b, v4.16b +; CHECK-NEXT: bsl v1.16b, v6.16b, v5.16b ; CHECK-NEXT: ret %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z @@ -850,46 +665,26 @@ ; CHECK-NEXT: bsl v25.16b, v20.16b, v24.16b ; CHECK-NEXT: mvn v20.16b, v22.16b ; CHECK-NEXT: mov v24.16b, v21.16b +; CHECK-NEXT: cmgt v4.2d, v4.2d, #0 +; CHECK-NEXT: cmgt v0.2d, v0.2d, v16.2d ; CHECK-NEXT: sub v19.2d, v3.2d, v7.2d ; CHECK-NEXT: bsl v24.16b, v22.16b, v20.16b ; CHECK-NEXT: mvn v20.16b, v23.16b ; CHECK-NEXT: mov v22.16b, v21.16b +; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b +; CHECK-NEXT: cmgt v4.2d, v5.2d, #0 +; CHECK-NEXT: cmgt v1.2d, v1.2d, v17.2d ; CHECK-NEXT: bsl v22.16b, v23.16b, v20.16b ; CHECK-NEXT: cmlt v20.2d, v19.2d, #0 +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmgt v4.2d, v6.2d, #0 +; CHECK-NEXT: cmgt v2.2d, v2.2d, v18.2d ; CHECK-NEXT: mvn v23.16b, v20.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: cmgt v4.2d, v7.2d, #0 +; CHECK-NEXT: cmgt v3.2d, v3.2d, v19.2d ; CHECK-NEXT: bsl v21.16b, v20.16b, v23.16b -; CHECK-NEXT: cmge v4.2d, v4.2d, #0 -; CHECK-NEXT: cmge v0.2d, v0.2d, #0 -; CHECK-NEXT: cmge v20.2d, v16.2d, #0 -; CHECK-NEXT: cmge v5.2d, v5.2d, #0 -; CHECK-NEXT: cmge v1.2d, v1.2d, #0 -; CHECK-NEXT: cmeq v4.2d, v0.2d, v4.2d -; CHECK-NEXT: cmeq v0.2d, v0.2d, v20.2d -; CHECK-NEXT: cmge v20.2d, v17.2d, #0 -; CHECK-NEXT: cmge v6.2d, v6.2d, #0 -; CHECK-NEXT: cmge v2.2d, v2.2d, #0 -; CHECK-NEXT: cmeq v5.2d, v1.2d, v5.2d -; CHECK-NEXT: cmeq v1.2d, v1.2d, v20.2d -; CHECK-NEXT: cmge v20.2d, v18.2d, #0 -; CHECK-NEXT: mvn v4.16b, v4.16b -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: cmge v7.2d, v7.2d, #0 -; CHECK-NEXT: cmge v3.2d, v3.2d, #0 -; CHECK-NEXT: cmeq v6.2d, v2.2d, v6.2d -; CHECK-NEXT: cmeq v2.2d, 
v2.2d, v20.2d -; CHECK-NEXT: cmge v20.2d, v19.2d, #0 -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: mvn v4.16b, v5.16b -; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: cmeq v7.2d, v3.2d, v7.2d -; CHECK-NEXT: cmeq v3.2d, v3.2d, v20.2d -; CHECK-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-NEXT: mvn v4.16b, v6.16b -; CHECK-NEXT: mvn v2.16b, v2.16b -; CHECK-NEXT: and v2.16b, v4.16b, v2.16b -; CHECK-NEXT: mvn v4.16b, v7.16b -; CHECK-NEXT: mvn v3.16b, v3.16b -; CHECK-NEXT: and v3.16b, v4.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v0.16b, v25.16b, v16.16b ; CHECK-NEXT: bsl v1.16b, v24.16b, v17.16b ; CHECK-NEXT: bsl v2.16b, v22.16b, v18.16b Index: llvm/test/CodeGen/AMDGPU/saddo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/saddo.ll +++ llvm/test/CodeGen/AMDGPU/saddo.ll @@ -13,29 +13,25 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: saddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; SI-NEXT: s_add_u32 s2, s10, s0 -; SI-NEXT: s_addc_u32 s3, s11, s1 -; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_add_u32 s10, s6, s8 +; SI-NEXT: s_addc_u32 s11, s7, s9 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] +; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: saddo_i64_zext: @@ -43,22 +39,18 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: s_add_u32 s8, s6, s0 +; VI-NEXT: s_addc_u32 s9, s7, s1 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2] +; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] -; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1 -; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] -; VI-NEXT: s_add_u32 s2, s6, s0 -; VI-NEXT: s_addc_u32 s3, s7, s1 -; 
VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v2 -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -68,22 +60,18 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_add_u32 s8, s6, s0 +; GFX9-NEXT: s_addc_u32 s9, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] -; GFX9-NEXT: s_add_u32 s2, s6, s0 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v2 -; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm @@ -99,32 +87,27 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; SI-NEXT: s_add_i32 s2, s0, s1 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: v_cmp_lt_i32_e64 s[10:11], s9, 0 +; SI-NEXT: s_add_i32 s9, s8, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, s9, v0 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_xor_b64 s[0:1], s[10:11], vcc +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_saddo_i32: @@ -133,18 +116,13 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3] -; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] -; VI-NEXT: s_add_i32 s2, s0, s1 -; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4 +; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0 +; VI-NEXT: s_add_i32 s1, s0, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4 +; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -158,18 +136,13 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3] -; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] -; GFX9-NEXT: s_add_i32 s2, s0, s1 -; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0 +; GFX9-NEXT: s_add_i32 s1, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc ; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 @@ -204,19 +177,12 @@ ; SI-NEXT: s_mov_b32 s6, s14 ; SI-NEXT: s_mov_b32 s7, s15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v1 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 ; SI-NEXT: s_endpgm ; @@ -235,17 +201,11 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6 -; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 -; 
VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v7, v5 -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: flat_store_dword v[2:3], v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v6 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6 +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: flat_store_dword v[2:3], v5 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm @@ -265,17 +225,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v7, v5 -; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: global_store_dword v[2:3], v4, off +; GFX9-NEXT: v_add_u32_e32 v5, v6, v4 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: global_store_dword v[2:3], v5, off ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -292,31 +246,27 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], -1 -; SI-NEXT: s_add_u32 s2, s8, s10 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[8:9], -1 -; SI-NEXT: s_addc_u32 s3, s9, s11 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s6, s14 -; SI-NEXT: s_mov_b32 s7, s15 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_add_u32 s12, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_addc_u32 s13, s5, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_xor_b64 s[0:1], s[4:5], vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_saddo_i64: @@ -324,22 +274,18 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: 
v_mov_b32_e32 v3, s1 -; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_addc_u32 s3, s5, s7 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc ; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -350,22 +296,18 @@ ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s0, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 +; GFX9-NEXT: s_addc_u32 s1, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 -; GFX9-NEXT: s_add_u32 s2, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_addc_u32 s3, s5, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -398,19 +340,12 @@ ; SI-NEXT: s_mov_b32 s6, s14 ; SI-NEXT: s_mov_b32 s7, s15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3] -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[0:1] -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v2 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 +; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc +; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 ; SI-NEXT: s_endpgm @@ -430,18 +365,12 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] -; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7] -; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[4:5] -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8 -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v9, v6 -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] +; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4 +; VI-NEXT: v_addc_u32_e32 v9, vcc, v7, v5, vcc +; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9] +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm @@ -461,18 +390,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v9, v6 -; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v5, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7] +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -489,48 +412,35 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind { ; SI-LABEL: v_saddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s19, 0xf000 +; SI-NEXT: s_mov_b32 s18, -1 +; SI-NEXT: s_mov_b32 s2, s18 +; SI-NEXT: s_mov_b32 s3, s19 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s14 +; SI-NEXT: s_mov_b32 s1, s15 +; SI-NEXT: s_mov_b32 s14, s18 +; SI-NEXT: s_mov_b32 s15, s19 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s6, s14 -; SI-NEXT: s_mov_b32 s7, s15 +; SI-NEXT: s_mov_b32 s16, s10 +; SI-NEXT: s_mov_b32 s17, s11 +; SI-NEXT: s_mov_b32 s10, s18 +; SI-NEXT: s_mov_b32 s11, s19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; 
SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v0 -; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v1 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v6, v2 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], v6, v2 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], v5, v3 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_and_b64 s[0:1], vcc, s[2:3] +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_saddo_v2i32: @@ -543,33 +453,21 @@ ; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6 -; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5 -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v5 -; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], v10, v6 -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], v10, v6 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8 -; VI-NEXT: v_cmp_ne_u32_e64 s[2:3], v9, v7 -; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] +; VI-NEXT: v_add_u32_e32 v9, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4 +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9] ; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] -; VI-NEXT: s_and_b64 s[0:1], vcc, s[2:3] +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -584,33 +482,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5 -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], v10, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], v10, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], v9, v7 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off +; GFX9-NEXT: v_add_u32_e32 v9, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v8, v6, v4 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6 +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] -; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[2:3] +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm Index: llvm/test/CodeGen/ARM/addsubo-legalization.ll =================================================================== --- llvm/test/CodeGen/ARM/addsubo-legalization.ll +++ llvm/test/CodeGen/ARM/addsubo-legalization.ll @@ -95,76 +95,48 @@ define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { ; CHECK-LABEL: saddo: ; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vmov.32 r1, d16[1] -; CHECK-NEXT: vld1.64 {d18, d19}, [r0] -; CHECK-NEXT: vmov.32 r2, d17[1] -; CHECK-NEXT: vadd.i64 q8, q9, q8 -; CHECK-NEXT: vmov.32 r12, d18[1] -; CHECK-NEXT: vmov.32 r4, d19[1] -; CHECK-NEXT: vmov.32 lr, d16[1] -; CHECK-NEXT: vmov.32 r7, d17[1] -; CHECK-NEXT: cmp.w r1, #-1 +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vadd.i64 q8, q10, q9 +; CHECK-NEXT: vmov.32 r2, d20[0] +; CHECK-NEXT: vmov.32 r1, d20[1] +; CHECK-NEXT: vmov.32 r12, d16[0] +; CHECK-NEXT: vmov.32 r8, d16[1] +; CHECK-NEXT: vmov.32 lr, d17[0] +; CHECK-NEXT: vmov.32 r4, d21[0] +; CHECK-NEXT: vmov.32 r5, d17[1] +; CHECK-NEXT: vmov.32 r6, d18[1] +; CHECK-NEXT: vmov.32 r7, d21[1] +; CHECK-NEXT: subs.w r2, r12, r2 +; CHECK-NEXT: vmov.32 r2, d19[1] +; CHECK-NEXT: sbcs.w r1, r8, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: cmp.w r2, #-1 -; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r2, #1 -; CHECK-NEXT: cmp.w r12, #-1 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r5, #-1 -; CHECK-NEXT: cmp.w r4, #-1 -; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r4, #1 -; CHECK-NEXT: cmp.w lr, #-1 -; 
CHECK-NEXT: it gt -; CHECK-NEXT: movgt r6, #1 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r6, #-1 -; CHECK-NEXT: cmp.w r7, #-1 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: subs.w r4, lr, r4 +; CHECK-NEXT: sbcs.w r7, r5, r7 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r3, #-1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: vdup.32 d19, r3 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: asrs r7, r6, #31 +; CHECK-NEXT: vdup.32 d21, r3 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: vdup.32 d23, r2 -; CHECK-NEXT: vdup.32 d21, r4 -; CHECK-NEXT: vdup.32 d18, r6 -; CHECK-NEXT: vdup.32 d22, r1 -; CHECK-NEXT: vdup.32 d20, r5 -; CHECK-NEXT: vceq.i32 q9, q10, q9 +; CHECK-NEXT: movne.w r1, #-1 +; CHECK-NEXT: vdup.32 d20, r1 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0] -; CHECK-NEXT: vceq.i32 q10, q10, q11 -; CHECK-NEXT: vrev64.32 q11, q9 -; CHECK-NEXT: vrev64.32 q12, q10 -; CHECK-NEXT: vand q9, q9, q11 -; CHECK-NEXT: vand q10, q10, q12 -; CHECK-NEXT: vbic q9, q10, q9 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vdup.32 d19, r2 +; CHECK-NEXT: vdup.32 d18, r7 +; CHECK-NEXT: veor q9, q9, q10 ; CHECK-NEXT: vmovn.i64 d18, q9 ; CHECK-NEXT: vmov r2, r1, d18 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} %x = load <2 x i64>, <2 x i64>* %ptr, align 8 %y = load <2 x i64>, <2 x i64>* %ptr2, align 8 %s = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y) @@ -177,77 +149,64 @@ define <2 x i1> @ssubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { ; CHECK-LABEL: ssubo: ; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] ; CHECK-NEXT: vsub.i64 q8, q10, q9 +; CHECK-NEXT: vmov.32 r1, d20[0] ; CHECK-NEXT: vmov.32 r12, d20[1] -; CHECK-NEXT: vmov.32 lr, d21[1] -; CHECK-NEXT: vmov.32 r1, d16[1] -; CHECK-NEXT: vmov.32 r2, d17[1] -; CHECK-NEXT: vmov.32 r4, d18[1] -; CHECK-NEXT: vmov.32 r7, d19[1] -; CHECK-NEXT: cmp.w r1, #-1 +; CHECK-NEXT: vmov.32 r3, d16[0] +; CHECK-NEXT: vmov.32 lr, d16[1] +; CHECK-NEXT: vmov.32 r4, d21[0] +; CHECK-NEXT: vmov.32 r5, d17[0] +; CHECK-NEXT: vmov.32 r6, d21[1] +; CHECK-NEXT: vmov.32 r7, d17[1] +; CHECK-NEXT: vmov.32 r8, d18[1] +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: vmov.32 r3, d18[0] +; CHECK-NEXT: sbcs.w r1, lr, r12 +; CHECK-NEXT: vmov.32 r12, d19[0] ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: cmp.w r2, #-1 -; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r2, #1 -; CHECK-NEXT: cmp.w r12, #-1 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r5, #-1 -; CHECK-NEXT: cmp.w lr, #-1 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r6, #1 -; CHECK-NEXT: cmp.w r4, #-1 -; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r4, #1 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: subs r5, r5, r4 +; CHECK-NEXT: vmov.32 r5, d19[1] +; CHECK-NEXT: sbcs r7, r6 +; CHECK-NEXT: mov.w 
r7, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r7, #1 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r7, #-1 +; CHECK-NEXT: vdup.32 d21, r7 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: sbcs.w r3, r2, r8 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: rsbs.w r6, r12, #0 +; CHECK-NEXT: sbcs.w r6, r2, r5 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: cmp.w r7, #-1 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #1 +; CHECK-NEXT: movne.w r2, #-1 ; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: vdup.32 d19, r2 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r3, #-1 -; CHECK-NEXT: vdup.32 d19, r3 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r6, #-1 -; CHECK-NEXT: vdup.32 d21, r6 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vdup.32 d18, r4 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: vdup.32 d23, r2 -; CHECK-NEXT: vdup.32 d20, r5 -; CHECK-NEXT: vdup.32 d22, r1 -; CHECK-NEXT: vceq.i32 q9, q10, q9 +; CHECK-NEXT: movne.w r1, #-1 +; CHECK-NEXT: vdup.32 d18, r3 +; CHECK-NEXT: vdup.32 d20, r1 +; CHECK-NEXT: veor q9, q9, q10 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0] -; CHECK-NEXT: vceq.i32 q10, q10, q11 -; CHECK-NEXT: vrev64.32 q11, q9 -; CHECK-NEXT: vrev64.32 q12, q10 -; CHECK-NEXT: vand q9, q9, q11 -; CHECK-NEXT: vand q10, q10, q12 -; CHECK-NEXT: vmvn q9, q9 -; CHECK-NEXT: vbic q9, q9, q10 ; CHECK-NEXT: vmovn.i64 d18, q9 ; CHECK-NEXT: vmov r2, r1, d18 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} %x = load <2 x i64>, <2 x i64>* %ptr, align 8 %y = load <2 x i64>, <2 x i64>* %ptr2, align 8 %s = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y) Index: llvm/test/CodeGen/RISCV/arith-with-overflow.ll =================================================================== --- llvm/test/CodeGen/RISCV/arith-with-overflow.ll +++ llvm/test/CodeGen/RISCV/arith-with-overflow.ll @@ -12,15 +12,9 @@ ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: add a3, a0, a1 ; RV32I-NEXT: sw a3, 0(a2) -; RV32I-NEXT: addi a2, zero, -1 -; RV32I-NEXT: slt a1, a2, a1 -; RV32I-NEXT: slt a0, a2, a0 -; RV32I-NEXT: slt a2, a2, a3 -; RV32I-NEXT: xor a2, a0, a2 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: snez a1, a2 -; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slt a0, a3, a0 +; RV32I-NEXT: slti a1, a1, 0 +; RV32I-NEXT: xor a0, a1, a0 ; RV32I-NEXT: ret entry: %x = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) @@ -30,20 +24,43 @@ ret i1 %ovf } +; Show that the xor is folded in the branch +define void @sadd_and_branch(i32 %a, i32 %b, void ()* %pf) nounwind { +; RV32I-LABEL: sadd_and_branch: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: add a3, a0, a1 +; RV32I-NEXT: slt a0, a3, a0 +; RV32I-NEXT: slti a1, a1, 0 +; RV32I-NEXT: beq a1, a0, .LBB1_2 +; RV32I-NEXT: # %bb.1: # %b.then +; RV32I-NEXT: jalr a2 +; RV32I-NEXT: .LBB1_2: # %b.end +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +entry: + %x = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) + %calc = extractvalue {i32, i1} %x, 0 + %ovf = extractvalue {i32, i1} %x, 1 + br i1 %ovf, label %b.then, label %b.end + +b.then: + call void %pf() + br label %b.end +b.end: + ret void +} + define i1 @ssub(i32 %a, i32 %b, i32* %c) nounwind { ; RV32I-LABEL: ssub: ; RV32I: 
# %bb.0: # %entry ; RV32I-NEXT: sub a3, a0, a1 ; RV32I-NEXT: sw a3, 0(a2) -; RV32I-NEXT: addi a2, zero, -1 -; RV32I-NEXT: slt a1, a2, a1 -; RV32I-NEXT: slt a0, a2, a0 -; RV32I-NEXT: slt a2, a2, a3 -; RV32I-NEXT: xor a2, a0, a2 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: snez a0, a0 -; RV32I-NEXT: snez a1, a2 -; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: sgtz a1, a1 +; RV32I-NEXT: slt a0, a3, a0 +; RV32I-NEXT: xor a0, a1, a0 ; RV32I-NEXT: ret entry: %x = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) @@ -53,6 +70,35 @@ ret i1 %ovf } +; Show that the xor is folded in the branch +define void @ssub_and_branch(i32 %a, i32 %b, void ()* %pf) nounwind { +; RV32I-LABEL: ssub_and_branch: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sgtz a3, a1 +; RV32I-NEXT: sub a1, a0, a1 +; RV32I-NEXT: slt a0, a1, a0 +; RV32I-NEXT: beq a3, a0, .LBB3_2 +; RV32I-NEXT: # %bb.1: # %b.then +; RV32I-NEXT: jalr a2 +; RV32I-NEXT: .LBB3_2: # %b.end +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +entry: + %x = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) + %calc = extractvalue {i32, i1} %x, 0 + %ovf = extractvalue {i32, i1} %x, 1 + br i1 %ovf, label %b.then, label %b.end + +b.then: + call void %pf() + br label %b.end +b.end: + ret void +} + define i1 @uadd(i32 %a, i32 %b, i32* %c) nounwind { ; RV32I-LABEL: uadd: ; RV32I: # %bb.0: # %entry Index: llvm/test/CodeGen/X86/combine-mulo.ll =================================================================== --- llvm/test/CodeGen/X86/combine-mulo.ll +++ llvm/test/CodeGen/X86/combine-mulo.ll @@ -34,30 +34,21 @@ ; SSE-LABEL: combine_vec_smul_two: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: paddd %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: paddd %xmm2, %xmm2 -; SSE-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_smul_two: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm3 -; AVX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq %1 = call {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> ) %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0 Index: llvm/test/CodeGen/X86/mulo-pow2.ll =================================================================== --- llvm/test/CodeGen/X86/mulo-pow2.ll +++ llvm/test/CodeGen/X86/mulo-pow2.ll @@ -98,15 +98,10 @@ define <4 x i32> @smul_v4i32_2(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX-LABEL: smul_v4i32_2: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm3 -; AVX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpxor %xmm4, 
%xmm3, %xmm3 -; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq %x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> ) %y = extractvalue { <4 x i32>, <4 x i1> } %x, 0 Index: llvm/test/CodeGen/X86/sadd_sat.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat.ll +++ llvm/test/CodeGen/X86/sadd_sat.ll @@ -183,28 +183,20 @@ ; ; X64-LABEL: vec: ; X64: # %bb.0: +; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pxor %xmm4, %xmm4 -; X64-NEXT: pcmpgtd %xmm1, %xmm4 -; X64-NEXT: pcmpeqd %xmm2, %xmm2 -; X64-NEXT: pxor %xmm2, %xmm4 -; X64-NEXT: pxor %xmm5, %xmm5 -; X64-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-NEXT: pxor %xmm2, %xmm5 -; X64-NEXT: pcmpeqd %xmm5, %xmm4 -; X64-NEXT: paddd %xmm1, %xmm0 -; X64-NEXT: pcmpgtd %xmm0, %xmm3 -; X64-NEXT: pxor %xmm3, %xmm2 -; X64-NEXT: pcmpeqd %xmm5, %xmm2 -; X64-NEXT: pandn %xmm4, %xmm2 -; X64-NEXT: movdqa %xmm3, %xmm1 -; X64-NEXT: pandn {{.*}}(%rip), %xmm1 -; X64-NEXT: psrld $1, %xmm3 -; X64-NEXT: por %xmm1, %xmm3 -; X64-NEXT: pand %xmm2, %xmm3 -; X64-NEXT: pandn %xmm0, %xmm2 +; X64-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-NEXT: pxor %xmm3, %xmm0 +; X64-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: pandn {{.*}}(%rip), %xmm3 +; X64-NEXT: psrld $1, %xmm2 ; X64-NEXT: por %xmm3, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: pandn %xmm1, %xmm0 +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y); ret <4 x i32> %tmp; Index: llvm/test/CodeGen/X86/sadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -569,165 +569,135 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: psllq $32, %xmm1 ; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; 
SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: psllq $32, %xmm1 ; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm6, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; 
SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm2 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm0, %xmm4 ; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: pandn %xmm0, %xmm2 -; SSSE3-NEXT: por %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: psllq $32, %xmm1 ; SSE41-NEXT: paddq %xmm1, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 -; SSE41-NEXT: movdqa 
%xmm2, %xmm3 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: pandn %xmm4, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -741,22 +711,13 @@ ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm2, {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -765,22 +726,13 @@ ; AVX2-LABEL: v2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 -; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vblendvpd %xmm2, {{.*}}(%rip), %xmm3, 
%xmm3 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] @@ -790,19 +742,16 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k0 ; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} -; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2} +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vpsraq $32, %xmm1, %xmm0 ; AVX512-NEXT: retq %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z @@ -811,133 +760,88 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: v4i32: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3 +; SSE2-NEXT: psrld $1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i32: ; SSSE3: # %bb.0: +; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm2, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: paddd %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: psrld $1, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; 
SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm3 +; SSSE3-NEXT: psrld $1, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: paddd %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE41-NEXT: pandn %xmm4, %xmm3 -; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: paddd %xmm1, %xmm3 +; SSE41-NEXT: movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2 -; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1 -; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; 
AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2} -; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512-NEXT: retq %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z @@ -946,214 +850,135 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm2, %xmm8 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm7, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pandn %xmm4, %xmm7 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: por %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; 
SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm2 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm8 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm7 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 -; SSSE3-NEXT: paddd %xmm2, %xmm8 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: paddd %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm7, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pandn %xmm4, %xmm7 -; SSSE3-NEXT: psrld $1, %xmm2 -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pandn %xmm6, %xmm7 +; SSSE3-NEXT: psrld $1, %xmm5 +; SSSE3-NEXT: por %xmm7, %xmm5 +; SSSE3-NEXT: pand %xmm0, %xmm5 +; SSSE3-NEXT: pandn %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 -; SSSE3-NEXT: paddd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pxor %xmm6, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm2 -; SSSE3-NEXT: psrld $1, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm1, %xmm5 -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: paddd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm2 +; SSSE3-NEXT: pandn %xmm6, %xmm2 +; SSSE3-NEXT: psrld $1, %xmm4 +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm3, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: paddd %xmm2, %xmm6 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE41-NEXT: pandn %xmm7, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm9 = 
[2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movaps %xmm5, %xmm7 -; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm7 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm6 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm8 -; SSE41-NEXT: pxor %xmm8, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE41-NEXT: pandn %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: paddd %xmm2, %xmm5 +; SSE41-NEXT: movaps {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: movaps %xmm6, %xmm7 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: movaps %xmm6, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2 +; SSE41-NEXT: movaps %xmm5, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm9, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %ymm1, {{.*}}(%rip), %ymm3, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5 +; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vblendvps %ymm5, 
{{.*}}(%rip), %ymm6, %ymm6 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vblendvps %ymm0, %ymm6, %ymm5, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vblendvps %ymm0, %ymm3, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %ymm0, %ymm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k2} -; AVX512-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k2} +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512-NEXT: retq %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z @@ -1162,378 +987,230 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm9, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm12 -; SSE2-NEXT: paddd %xmm4, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; 
SSE2-NEXT: pandn %xmm9, %xmm11 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: por %xmm11, %xmm10 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm12, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm12, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: pandn %xmm9, %xmm10 ; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn %xmm11, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm11 -; SSE2-NEXT: pxor %xmm9, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm11 -; SSE2-NEXT: paddd %xmm5, %xmm8 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm12, %xmm4 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: paddd %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: pxor %xmm9, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pandn %xmm12, %xmm4 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm8 -; SSE2-NEXT: por %xmm6, %xmm8 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm9, %xmm5 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE2-NEXT: pandn %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm12, %xmm2 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: por %xmm2, %xmm10 -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pandn %xmm3, %xmm9 -; SSE2-NEXT: por %xmm10, %xmm9 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm3 +; 
SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm9, %xmm4 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: pandn %xmm7, %xmm3 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: movdqa %xmm0, %xmm11 -; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: pxor %xmm12, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9 -; SSSE3-NEXT: pxor %xmm9, %xmm12 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm12 -; SSSE3-NEXT: paddd %xmm4, %xmm11 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9 +; SSSE3-NEXT: paddd %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 ; SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm12, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm12, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm10 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pandn %xmm9, %xmm11 +; SSSE3-NEXT: psrld $1, %xmm10 +; SSSE3-NEXT: por %xmm11, %xmm10 +; SSSE3-NEXT: pand %xmm0, %xmm10 +; SSSE3-NEXT: pandn %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm10, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: paddd %xmm1, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm10 +; SSSE3-NEXT: pandn %xmm9, %xmm10 ; SSSE3-NEXT: psrld $1, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm0, %xmm4 -; SSSE3-NEXT: pandn %xmm11, %xmm0 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11 -; SSSE3-NEXT: pxor %xmm9, %xmm11 +; SSSE3-NEXT: por %xmm10, %xmm4 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm11 -; SSSE3-NEXT: paddd %xmm5, %xmm8 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm11, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pandn %xmm12, %xmm4 -; SSSE3-NEXT: psrld $1, %xmm5 -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: paddd %xmm2, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: paddd %xmm6, %xmm2 -; SSSE3-NEXT: 
pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm8 -; SSSE3-NEXT: pxor %xmm9, %xmm8 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 -; SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pandn %xmm12, %xmm4 -; SSSE3-NEXT: psrld $1, %xmm6 -; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pandn %xmm2, %xmm8 -; SSSE3-NEXT: por %xmm6, %xmm8 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pandn %xmm9, %xmm5 +; SSSE3-NEXT: psrld $1, %xmm4 +; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pandn %xmm6, %xmm2 +; SSSE3-NEXT: por %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSSE3-NEXT: paddd %xmm7, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSSE3-NEXT: pxor %xmm10, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 -; SSSE3-NEXT: pandn %xmm2, %xmm9 -; SSSE3-NEXT: movdqa %xmm10, %xmm2 -; SSSE3-NEXT: pandn %xmm12, %xmm2 -; SSSE3-NEXT: psrld $1, %xmm10 -; SSSE3-NEXT: por %xmm2, %xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pandn %xmm3, %xmm9 -; SSSE3-NEXT: por %xmm10, %xmm9 -; SSSE3-NEXT: movdqa %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 +; SSSE3-NEXT: paddd %xmm3, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: pandn %xmm9, %xmm4 +; SSSE3-NEXT: psrld $1, %xmm8 +; SSSE3-NEXT: por %xmm4, %xmm8 +; SSSE3-NEXT: pand %xmm3, %xmm8 +; SSSE3-NEXT: pandn %xmm7, %xmm3 +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm3, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm12 +; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: pxor %xmm11, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE41-NEXT: pxor %xmm10, %xmm11 -; SSE41-NEXT: pxor %xmm12, %xmm12 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 -; SSE41-NEXT: pxor %xmm10, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm12, %xmm11 ; SSE41-NEXT: paddd %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm12, %xmm4 -; SSE41-NEXT: pandn %xmm11, %xmm4 -; SSE41-NEXT: movaps {{.*#+}} xmm12 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movaps {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movaps %xmm11, %xmm13 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm13 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm9 -; SSE41-NEXT: xorps %xmm13, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 -; SSE41-NEXT: pxor %xmm10, %xmm13 -; SSE41-NEXT: pxor %xmm14, %xmm14 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm14 -; SSE41-NEXT: pxor %xmm10, %xmm14 -; SSE41-NEXT: pcmpeqd %xmm14, %xmm13 -; SSE41-NEXT: paddd %xmm5, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm14, %xmm4 -; SSE41-NEXT: pandn %xmm13, %xmm4 -; SSE41-NEXT: movaps %xmm11, %xmm5 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: 
pxor %xmm13, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 -; SSE41-NEXT: pxor %xmm10, %xmm13 -; SSE41-NEXT: xorps %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm13 -; SSE41-NEXT: paddd %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: pandn %xmm13, %xmm4 -; SSE41-NEXT: movaps %xmm11, %xmm5 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: movaps {{.*#+}} xmm11 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movaps {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: movaps %xmm10, %xmm2 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm9 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: paddd %xmm5, %xmm4 +; SSE41-NEXT: movaps %xmm10, %xmm2 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE41-NEXT: paddd %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE41-NEXT: pxor %xmm8, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm10 -; SSE41-NEXT: pandn %xmm0, %xmm10 +; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm12, %xmm3 +; SSE41-NEXT: paddd %xmm6, %xmm3 +; SSE41-NEXT: movaps %xmm10, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE41-NEXT: pxor %xmm6, %xmm12 +; SSE41-NEXT: movdqa %xmm12, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm5 +; SSE41-NEXT: paddd %xmm7, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: pxor %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm11 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5 ; SSE41-NEXT: movaps %xmm9, %xmm0 +; SSE41-NEXT: movaps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm5, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 -; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 -; AVX1-NEXT: vpcmpgtd %xmm9, %xmm12, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm7, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm10 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm10, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm11 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm11, %xmm6, %xmm11 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 -; AVX1-NEXT: vpaddd %xmm9, %xmm7, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm9, %xmm12, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, 
%xmm10, %xmm10 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm12, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-NEXT: vandnps %ymm8, %ymm4, %ymm4 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 ; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %ymm7, %ymm8, %ymm10, %ymm7 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm4, %ymm7, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm12, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm12, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm11 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm12, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm11, %xmm7, %xmm11 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm11 -; AVX1-NEXT: vpcmpgtd %xmm11, %xmm12, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm12, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-NEXT: vandnps %ymm9, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vblendvps %ymm3, %ymm8, %ymm10, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vblendvps %ymm7, %ymm8, %ymm9, %ymm10 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vblendvps %ymm0, %ymm10, %ymm7, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm6 +; AVX1-NEXT: vblendvps %ymm6, %ymm8, %ymm9, %ymm7 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm7, %ymm6, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm8 -; AVX2-NEXT: 
vpcmpeqd %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm7 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm8 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %ymm2, %ymm7, %ymm8, %ymm2 -; AVX2-NEXT: vblendvps %ymm5, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm3 -; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vblendvps %ymm3, %ymm7, %ymm8, %ymm3 -; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm4 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vblendvps %ymm4, %ymm5, %ymm6, %ymm7 +; AVX2-NEXT: vpcmpgtd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vblendvps %ymm0, %ymm7, %ymm4, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm2 +; AVX2-NEXT: vblendvps %ymm2, %ymm5, %ymm6, %ymm4 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vblendvps %ymm1, %ymm4, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %zmm0, %zmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y) ret <16 x i32> %z @@ -1543,152 +1220,120 @@ ; SSE2-LABEL: v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; 
SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 ; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; 
SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 ; SSSE3-NEXT: pxor %xmm5, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm2 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: paddq %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; 
SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: pandn %xmm4, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1698,56 +1343,36 @@ ; ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm2, {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 -; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vblendvpd %xmm2, {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; 
AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k0 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2} +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512-NEXT: retq %z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z @@ -1756,369 +1381,279 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-LABEL: v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE2-NEXT: paddq %xmm2, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: 
pand %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm9, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm11, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 ; SSE2-NEXT: paddq %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm10, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm11, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm9, %xmm4 +; 
SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSSE3-NEXT: paddq %xmm2, %xmm9 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm10, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm10, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,0,3,2] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: movdqa %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm10, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: paddq %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pandn %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm11, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm0, %xmm5 -; SSSE3-NEXT: pandn %xmm9, %xmm0 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: paddq %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm10, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd 
%xmm8, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pandn %xmm9, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] ; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2] +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: paddq %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm11, %xmm3 -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pandn %xmm9, %xmm4 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: paddq %xmm2, %xmm9 -; 
SSE41-NEXT: pxor %xmm10, %xmm2 -; SSE41-NEXT: movdqa %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE41-NEXT: pand %xmm7, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm7 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm2 -; SSE41-NEXT: pandn %xmm7, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm10, %xmm3 -; SSE41-NEXT: movdqa %xmm10, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; 
SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm4 -; SSE41-NEXT: pcmpeqq %xmm3, %xmm4 -; SSE41-NEXT: pandn %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 -; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm7 +; SSE41-NEXT: por %xmm0, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm7, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqq %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqq %xmm9, %xmm4, %xmm9 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-NEXT: vpaddq %xmm2, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandnpd %ymm8, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vmovapd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %ymm1, {{.*}}(%rip), %ymm3, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: 
vpaddq %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5 +; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vblendvpd %ymm5, {{.*}}(%rip), %ymm6, %ymm6 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vxorpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm6, %ymm5, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2 -; AVX2-NEXT: vpcmpeqq %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpcmpnltq %ymm2, %ymm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpnltq %ymm2, %ymm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %ymm0, %ymm2, %k2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k2} -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} +; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k0 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k2 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k2} +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512-NEXT: retq %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z @@ -2128,687 +1663,513 @@ ; SSE2-LABEL: v8i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm0, %xmm12 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE2-NEXT: paddq %xmm4, %xmm13 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm10, %xmm1 ; SSE2-NEXT: pxor 
%xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm14 -; SSE2-NEXT: pxor %xmm10, %xmm14 -; SSE2-NEXT: pcmpeqd %xmm14, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm14, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm11, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm11, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm12, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm13, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pandn %xmm12, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 ; SSE2-NEXT: paddq %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm12 ; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: 
pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm13, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm15 -; SSE2-NEXT: pxor %xmm10, %xmm15 -; SSE2-NEXT: pcmpeqd %xmm15, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm13 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm14, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm12, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm15, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm13, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm11, %xmm5 ; SSE2-NEXT: pand %xmm12, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm4 +; SSE2-NEXT: pand %xmm11, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: paddq %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm8 ; SSE2-NEXT: pxor %xmm9, %xmm6 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm12, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm4 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm11, %xmm5 -; SSE2-NEXT: pand %xmm12, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm5 +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 ; SSE2-NEXT: paddq %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm9, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; 
SSE2-NEXT: por %xmm4, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,0,3,2] -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm11, %xmm2 -; SSE2-NEXT: pand %xmm12, %xmm7 -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm5, %xmm7 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: movdqa %xmm0, %xmm13 +; SSSE3-NEXT: movdqa %xmm0, %xmm12 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSSE3-NEXT: paddq %xmm4, %xmm13 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm10 -; SSSE3-NEXT: pxor %xmm10, %xmm1 ; SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: paddq %xmm4, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: pand %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm14 -; SSSE3-NEXT: pxor %xmm10, %xmm14 -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,0,3,2] -; SSSE3-NEXT: pand %xmm1, %xmm11 -; SSSE3-NEXT: movdqa %xmm13, %xmm0 -; SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm10 +; SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm9, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm11, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm11, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} 
xmm11 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm11, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm12, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm13, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm10, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSSE3-NEXT: pand %xmm11, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pandn %xmm12, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 ; SSSE3-NEXT: paddq %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm12 ; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: pand %xmm13, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm15 -; SSSE3-NEXT: pxor %xmm10, %xmm15 -; SSSE3-NEXT: pcmpeqd %xmm15, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm13 -; SSSE3-NEXT: movdqa %xmm8, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm14, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm12, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm15, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm13, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm11, %xmm5 ; SSSE3-NEXT: pand %xmm12, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: movdqa 
%xmm5, %xmm4 +; SSSE3-NEXT: pandn %xmm10, %xmm4 +; SSSE3-NEXT: pand %xmm11, %xmm5 +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm1, %xmm5 ; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 ; SSSE3-NEXT: paddq %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm8 ; SSSE3-NEXT: pxor %xmm9, %xmm6 -; SSSE3-NEXT: movdqa %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSSE3-NEXT: pand %xmm12, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 ; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm10, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2] -; SSSE3-NEXT: pand %xmm5, %xmm8 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm11, %xmm5 -; SSSE3-NEXT: pand %xmm12, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm5 +; SSSE3-NEXT: pand %xmm11, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm4, %xmm6 +; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm5 ; SSSE3-NEXT: paddq %xmm7, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 +; SSSE3-NEXT: pshufd 
{{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 ; SSSE3-NEXT: pxor %xmm9, %xmm7 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm9, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm7 -; SSSE3-NEXT: pxor %xmm7, %xmm10 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,0,3,2] -; SSSE3-NEXT: pand %xmm10, %xmm5 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm7, %xmm2 -; SSSE3-NEXT: pandn %xmm11, %xmm2 -; SSSE3-NEXT: pand %xmm12, %xmm7 -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: pand %xmm5, %xmm7 -; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: por %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm6 ; SSSE3-NEXT: movdqa %xmm6, %xmm2 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: pand %xmm11, %xmm6 +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm11 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm8 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 +; SSE41-NEXT: movdqa %xmm0, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm12 +; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm4 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: pand %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE41-NEXT: por %xmm4, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE41-NEXT: pxor %xmm9, %xmm12 -; SSE41-NEXT: pxor %xmm10, %xmm11 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm12, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm11 ; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE41-NEXT: por %xmm4, %xmm11 -; SSE41-NEXT: pxor %xmm9, %xmm11 -; SSE41-NEXT: pcmpeqq %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm14 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm14, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: pcmpeqq %xmm11, %xmm4 -; SSE41-NEXT: pandn %xmm12, %xmm4 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm13 +; SSE41-NEXT: movdqa %xmm9, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm10, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm8 -; SSE41-NEXT: movdqa %xmm1, %xmm14 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm5, %xmm1 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm15 -; SSE41-NEXT: pxor %xmm9, %xmm15 -; SSE41-NEXT: pxor %xmm10, %xmm14 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm12 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm14 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE41-NEXT: por %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm13 +; SSE41-NEXT: por %xmm0, %xmm13 ; SSE41-NEXT: pxor %xmm9, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm15 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; 
SSE41-NEXT: movdqa %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm14 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm14, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 -; SSE41-NEXT: pandn %xmm15, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm14, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm13, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm12, %xmm13 +; SSE41-NEXT: movdqa %xmm9, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm5 -; SSE41-NEXT: pxor %xmm9, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm9, %xmm6 -; SSE41-NEXT: pcmpeqq %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm14 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm12 ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: pcmpeqq %xmm6, %xmm4 -; SSE41-NEXT: pandn %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm12, %xmm5 +; SSE41-NEXT: movdqa %xmm9, %xmm6 +; 
SSE41-NEXT: pcmpgtd %xmm12, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm10, %xmm7 -; SSE41-NEXT: movdqa %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: pxor %xmm9, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm9 -; SSE41-NEXT: pandn %xmm4, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm9, %xmm7 ; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 -; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm12, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm10 -; AVX1-NEXT: vpcmpeqq %xmm8, %xmm10, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqq %xmm11, %xmm6, %xmm11 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 -; AVX1-NEXT: vpaddq 
%xmm9, %xmm7, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm12, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpeqq %xmm4, %xmm10, %xmm10 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm12, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-NEXT: vandnpd %ymm8, %ymm4, %ymm4 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 ; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX1-NEXT: vmovapd {{.*#+}} ymm10 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %ymm7, %ymm8, %ymm10, %ymm7 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm4, %ymm7, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm12, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqq %xmm7, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm12, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm12, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqq %xmm11, %xmm7, %xmm11 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm11, %xmm12, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm12, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-NEXT: vandnpd %ymm9, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vblendvpd %ymm3, %ymm8, %ymm10, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovapd {{.*#+}} ymm9 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vblendvpd %ymm7, %ymm8, %ymm9, %ymm10 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm10, %ymm7, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm6 +; AVX1-NEXT: vblendvpd %ymm6, %ymm8, %ymm9, %ymm7 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vxorpd %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm7, %ymm6, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqq %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vpaddq %ymm2, 
%ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm8 -; AVX2-NEXT: vpcmpeqq %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %ymm2, %ymm7, %ymm8, %ymm2 -; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqq %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm3 -; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4 -; AVX2-NEXT: vpcmpeqq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vblendvpd %ymm3, %ymm7, %ymm8, %ymm3 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm4 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm6, %ymm7 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm4, %ymm0 +; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm6, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm4, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %zmm2, %zmm1, %k0 -; AVX512-NEXT: vpcmpnltq %zmm2, %zmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpcmpnltq %zmm2, %zmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vpcmpgtq %zmm1, %zmm2, %k0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtq %zmm1, %zmm2, %k2 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y) ret <8 x i64> %z Index: llvm/test/CodeGen/X86/ssub_sat.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat.ll +++ llvm/test/CodeGen/X86/ssub_sat.ll @@ -183,30 +183,20 @@ ; ; X64-LABEL: vec: ; X64: # %bb.0: -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: pcmpeqd %xmm4, 
%xmm4 -; X64-NEXT: pxor %xmm4, %xmm0 -; X64-NEXT: pxor %xmm5, %xmm5 -; X64-NEXT: pcmpgtd %xmm2, %xmm5 -; X64-NEXT: pxor %xmm4, %xmm5 -; X64-NEXT: pcmpeqd %xmm5, %xmm0 -; X64-NEXT: psubd %xmm1, %xmm2 -; X64-NEXT: pcmpgtd %xmm2, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm1 -; X64-NEXT: pxor %xmm4, %xmm1 -; X64-NEXT: pcmpeqd %xmm5, %xmm1 -; X64-NEXT: pxor %xmm4, %xmm1 -; X64-NEXT: pandn %xmm1, %xmm0 -; X64-NEXT: movdqa %xmm3, %xmm1 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psubd %xmm1, %xmm3 +; X64-NEXT: pcmpgtd %xmm2, %xmm1 +; X64-NEXT: pcmpgtd %xmm3, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: pcmpgtd %xmm3, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm1 ; X64-NEXT: pandn {{.*}}(%rip), %xmm1 -; X64-NEXT: psrld $1, %xmm3 -; X64-NEXT: por %xmm1, %xmm3 -; X64-NEXT: pand %xmm0, %xmm3 -; X64-NEXT: pandn %xmm2, %xmm0 -; X64-NEXT: por %xmm3, %xmm0 +; X64-NEXT: psrld $1, %xmm2 +; X64-NEXT: por %xmm1, %xmm2 +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: pandn %xmm3, %xmm0 +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y); ret <4 x i32> %tmp; Index: llvm/test/CodeGen/X86/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -569,171 +569,135 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: psllq $32, %xmm1 ; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psllq $32, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: psllq $32, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: psllq $32, %xmm1 ; SSSE3-NEXT: psubq %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm6, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: 
por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm2 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm0, %xmm4 ; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pandn %xmm0, %xmm3 -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: psllq $32, %xmm1 ; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: movdqa 
%xmm1, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pandn %xmm3, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -748,22 +712,14 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -773,22 +729,14 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; 
AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] @@ -798,19 +746,16 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0 ; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} -; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2} +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vpsraq $32, %xmm1, %xmm0 ; AVX512-NEXT: retq %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z @@ -819,141 +764,94 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 -; SSSE3-NEXT: psubd %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; 
SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: psrld $1, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: psrld $1, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm3, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE41-NEXT: psubd %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: pandn %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psubd %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm2 ; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = 
[2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1 -; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2} -; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} +; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512-NEXT: retq %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z @@ -962,226 +860,144 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm2 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; 
SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm5, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSSE3-NEXT: pxor %xmm8, %xmm0 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pxor %xmm8, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm0 -; SSSE3-NEXT: psubd %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm7, %xmm2 -; SSSE3-NEXT: psrld $1, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm0, %xmm6 -; SSSE3-NEXT: pandn %xmm4, %xmm0 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSSE3-NEXT: psubd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: pxor %xmm8, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm8, %xmm3 -; SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm7, %xmm3 -; SSSE3-NEXT: psrld $1, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: psubd %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pandn %xmm6, %xmm7 +; SSSE3-NEXT: psrld $1, %xmm2 +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psubd %xmm3, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm6, %xmm3 +; SSSE3-NEXT: psrld $1, %xmm4 +; 
SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i32: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm8, %xmm8 ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE41-NEXT: pxor %xmm8, %xmm6 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm8, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: psubd %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm8, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE41-NEXT: pxor %xmm8, %xmm2 -; SSE41-NEXT: pandn %xmm2, %xmm6 -; SSE41-NEXT: movaps {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movaps {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movaps %xmm7, %xmm2 -; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm8, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm8, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: psubd %xmm3, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm8, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm8, %xmm3 -; SSE41-NEXT: pandn %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: movaps {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: movaps %xmm6, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm2 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm7 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psubd %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2 ; SSE41-NEXT: movaps %xmm5, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm9, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: 
vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandnps %ymm2, %ymm8, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %ymm1, {{.*}}(%rip), %ymm3, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vblendvps %ymm1, {{.*}}(%rip), %ymm2, %ymm2 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %ymm0, %ymm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k2} -; AVX512-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} +; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0 
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k2} +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512-NEXT: retq %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z @@ -1190,399 +1006,244 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm11 -; SSE2-NEXT: pxor %xmm10, %xmm11 -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: psubd %xmm4, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm11, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm12, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 -; SSE2-NEXT: pxor %xmm10, %xmm12 -; SSE2-NEXT: pcmpeqd %xmm12, %xmm1 -; SSE2-NEXT: psubd %xmm5, %xmm8 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm12, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pxor %xmm10, %xmm8 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: psubd %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pandn %xmm6, %xmm8 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm11, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm11 +; SSE2-NEXT: pandn %xmm10, %xmm11 ; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm8 -; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: por %xmm11, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pandn %xmm9, %xmm0 +; 
SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: psubd %xmm7, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: pandn %xmm11, %xmm2 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: por %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pandn %xmm10, %xmm9 +; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: por %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psubd %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm10, %xmm6 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psubd %xmm7, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm5 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: por %xmm5, %xmm8 +; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: movdqa %xmm0, %xmm12 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm10 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm11 -; SSSE3-NEXT: pxor %xmm10, %xmm11 -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: psubd %xmm4, %xmm12 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pandn %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm11, %xmm4 -; SSSE3-NEXT: psrld $1, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm12, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pxor %xmm12, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm12 -; SSSE3-NEXT: pxor %xmm10, %xmm12 -; SSSE3-NEXT: pcmpeqd %xmm12, %xmm1 -; SSSE3-NEXT: psubd %xmm5, %xmm8 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; 
SSSE3-NEXT: pcmpeqd %xmm12, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pandn %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm11, %xmm5 -; SSSE3-NEXT: psrld $1, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 -; SSSE3-NEXT: pxor %xmm10, %xmm8 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 -; SSSE3-NEXT: psubd %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm9 +; SSSE3-NEXT: psubd %xmm4, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm10, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm10, %xmm6 -; SSSE3-NEXT: pandn %xmm6, %xmm8 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm11, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm4, %xmm11 +; SSSE3-NEXT: pandn %xmm10, %xmm11 ; SSSE3-NEXT: psrld $1, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: pandn %xmm2, %xmm8 -; SSSE3-NEXT: por %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm11, %xmm4 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pandn %xmm9, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: psubd %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 -; SSSE3-NEXT: psubd %xmm7, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm9 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, %xmm2 -; SSSE3-NEXT: pandn %xmm11, %xmm2 -; SSSE3-NEXT: psrld $1, %xmm9 -; SSSE3-NEXT: por %xmm2, %xmm9 -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pandn %xmm10, %xmm9 +; SSSE3-NEXT: psrld $1, %xmm5 ; SSSE3-NEXT: por %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: psubd %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm10, %xmm6 +; SSSE3-NEXT: psrld $1, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: psubd %xmm7, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm7, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm5 +; SSSE3-NEXT: psrld $1, 
%xmm8 +; SSSE3-NEXT: por %xmm5, %xmm8 +; SSSE3-NEXT: pand %xmm3, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm10, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm11, %xmm11 -; SSE41-NEXT: pxor %xmm11, %xmm10 -; SSE41-NEXT: pxor %xmm12, %xmm12 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 -; SSE41-NEXT: pxor %xmm11, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm12, %xmm10 +; SSE41-NEXT: movdqa %xmm0, %xmm9 ; SSE41-NEXT: psubd %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm11, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm12, %xmm4 -; SSE41-NEXT: pxor %xmm11, %xmm4 -; SSE41-NEXT: pandn %xmm4, %xmm10 -; SSE41-NEXT: movaps {{.*#+}} xmm13 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movaps {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movaps %xmm12, %xmm4 -; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm4 -; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: movaps {{.*#+}} xmm12 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movaps {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: movaps %xmm11, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: xorps %xmm4, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm11, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm10 -; SSE41-NEXT: pxor %xmm11, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE41-NEXT: psubd %xmm5, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm11, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pxor %xmm11, %xmm5 -; SSE41-NEXT: pandn %xmm5, %xmm4 -; SSE41-NEXT: movaps %xmm12, %xmm5 -; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm11, %xmm4 -; SSE41-NEXT: xorps %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm11, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: psubd %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm11, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE41-NEXT: pxor %xmm11, %xmm6 -; SSE41-NEXT: pandn %xmm6, %xmm4 -; SSE41-NEXT: movaps %xmm12, %xmm5 -; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psubd %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm1 +; SSE41-NEXT: movaps %xmm11, %xmm3 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE41-NEXT: pxor %xmm11, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: psubd %xmm7, %xmm3 -; 
SSE41-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psubd %xmm6, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: movaps %xmm11, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm8, %xmm5 -; SSE41-NEXT: pxor %xmm11, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm11, %xmm5 -; SSE41-NEXT: pandn %xmm5, %xmm4 +; SSE41-NEXT: psubd %xmm7, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: pxor %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm11 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm12 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm3 +; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movaps %xmm9, %xmm0 +; SSE41-NEXT: movaps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm5, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 -; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; AVX1-NEXT: vpcmpgtd %xmm9, %xmm10, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm7, %xmm10, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm12 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm12, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm10, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm11 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm10, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm11, %xmm5, %xmm11 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 -; AVX1-NEXT: vpsubd %xmm9, %xmm7, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm9, %xmm10, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm11 -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm10, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 -; AVX1-NEXT: vandnps %ymm5, %ymm8, %ymm5 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm7 -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX1-NEXT: vmovaps {{.*#+}} ymm11 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vblendvps %ymm7, %ymm8, %ymm11, %ymm7 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm5, %ymm7, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm10, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm10, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm10, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm12 -; 
AVX1-NEXT: vpcmpgtd %xmm1, %xmm10, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm12, %xmm7, %xmm12 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm12 -; AVX1-NEXT: vpcmpgtd %xmm12, %xmm10, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm10, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vandnps %ymm2, %ymm9, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-NEXT: vblendvps %ymm3, %ymm8, %ymm11, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm6, %ymm7 +; AVX1-NEXT: vblendvps %ymm0, %ymm7, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm7, %xmm7 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm5, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm6, %ymm3 +; AVX1-NEXT: vblendvps %ymm1, %ymm3, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm8 -; AVX2-NEXT: vpcmpeqd %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpandn %ymm7, %ymm5, %ymm5 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm7 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm8 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vblendvps %ymm2, %ymm7, %ymm8, %ymm2 -; AVX2-NEXT: vblendvps %ymm5, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm3 -; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpxor 
%ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vblendvps %ymm3, %ymm7, %ymm8, %ymm3 -; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vblendvps %ymm2, %ymm5, %ymm6, %ymm7 +; AVX2-NEXT: vblendvps %ymm0, %ymm7, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vblendvps %ymm3, %ymm5, %ymm6, %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %zmm0, %zmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0 +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y) ret <16 x i32> %z @@ -1592,50 +1253,38 @@ ; SSE2-LABEL: v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; 
SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 @@ -1648,50 +1297,38 @@ ; SSSE3-LABEL: v2i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: psubq %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; 
SSSE3-NEXT: pxor %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pandn %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm2 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm2, %xmm3 @@ -1704,46 +1341,32 @@ ; SSE41-LABEL: v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pandn %xmm3, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1754,57 
+1377,39 @@ ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} +; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2} +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512-NEXT: retq %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z @@ -1813,381 +1418,285 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-LABEL: v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: 
movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE2-NEXT: psubq %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: psubq %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm9, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm9, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm11, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn %xmm10, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm9, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: 
pxor %xmm8, %xmm2 ; SSE2-NEXT: psubq %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSSE3-NEXT: psubq %xmm2, %xmm10 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: psubq %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; 
SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9 -; SSSE3-NEXT: pxor %xmm9, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm10, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm9, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm11, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pand %xmm0, %xmm4 -; SSSE3-NEXT: pandn %xmm10, %xmm0 -; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pandn %xmm9, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 ; SSSE3-NEXT: psubq %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; 
SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: pand %xmm11, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pandn %xmm9, %xmm4 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] -; SSE41-NEXT: psubq %xmm2, %xmm9 -; SSE41-NEXT: pxor %xmm11, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: psubq %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE41-NEXT: 
pxor %xmm10, %xmm2 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pcmpeqq %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: pandn %xmm5, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psubq %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm11, %xmm3 -; SSE41-NEXT: movdqa %xmm11, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm10, %xmm2 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm10, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm3, %xmm2 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pcmpeqq %xmm3, %xmm4 -; 
SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pandn %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: psubq %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm7 +; SSE41-NEXT: por %xmm0, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm7, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqq %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqq %xmm9, %xmm4, %xmm9 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandnpd %ymm2, %ymm8, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vmovapd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %ymm1, {{.*}}(%rip), %ymm3, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorpd %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vblendvpd %ymm1, {{.*}}(%rip), %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2 -; AVX2-NEXT: vpcmpeqq %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpcmpnltq %ymm2, %ymm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpnltq %ymm2, %ymm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %ymm0, %ymm2, %k2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k2} -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} +; AVX512-NEXT: vpcmpgtq %ymm2, %ymm1, %k0 +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k2 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k2} +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512-NEXT: retq %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z @@ -2197,414 +1706,324 @@ ; SSE2-LABEL: v8i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm0, %xmm12 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE2-NEXT: psubq %xmm4, %xmm13 +; SSE2-NEXT: pxor %xmm9, %xmm0 +; SSE2-NEXT: psubq %xmm4, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 ; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; 
SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm15 -; SSE2-NEXT: pxor %xmm10, %xmm15 -; SSE2-NEXT: pcmpeqd %xmm15, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm14, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm11 -; SSE2-NEXT: pxor %xmm10, %xmm11 -; SSE2-NEXT: pcmpeqd %xmm15, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,0,3,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] ; SSE2-NEXT: pand %xmm11, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm11, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm12, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm13, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pandn %xmm12, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 ; SSE2-NEXT: psubq %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm12 ; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm13, %xmm5 
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm9, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm12, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm14, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm13 -; SSE2-NEXT: pxor %xmm10, %xmm13 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,0,3,2] -; SSE2-NEXT: pand %xmm13, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm11, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm4 +; SSE2-NEXT: pand %xmm11, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: psubq %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm8 ; SSE2-NEXT: pxor %xmm9, %xmm6 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm12, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm10, 
%xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pandn %xmm6, %xmm8 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm11, %xmm5 -; SSE2-NEXT: pand %xmm12, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm8 -; SSE2-NEXT: por %xmm4, %xmm8 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: psubq %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm5 +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: psubq %xmm7, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm9, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, 
%xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm11, %xmm2 -; SSE2-NEXT: pand %xmm12, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: movdqa %xmm0, %xmm13 +; SSSE3-NEXT: movdqa %xmm0, %xmm12 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSSE3-NEXT: psubq %xmm4, %xmm13 +; SSSE3-NEXT: pxor %xmm9, %xmm0 +; SSSE3-NEXT: psubq %xmm4, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm10 ; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm10 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pand %xmm11, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm15 -; SSSE3-NEXT: pxor %xmm10, %xmm15 -; SSSE3-NEXT: pcmpeqd %xmm15, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm13, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm14, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm11 -; SSSE3-NEXT: pxor %xmm10, %xmm11 -; SSSE3-NEXT: pcmpeqd %xmm15, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,0,3,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm1 +; 
SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm10, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] ; SSSE3-NEXT: pand %xmm11, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pandn %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm11, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm12, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm13, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pandn %xmm12, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 ; SSSE3-NEXT: psubq %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm12 ; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: pand %xmm13, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm8, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm9, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm12, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm14, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm13 -; SSSE3-NEXT: pxor %xmm10, %xmm13 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,0,3,2] -; SSSE3-NEXT: pand %xmm13, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pandn %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm11, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pand %xmm12, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm1, 
%xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pandn %xmm10, %xmm4 +; SSSE3-NEXT: pand %xmm11, %xmm5 +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm1, %xmm5 ; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 ; SSSE3-NEXT: psubq %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm8 ; SSSE3-NEXT: pxor %xmm9, %xmm6 -; SSSE3-NEXT: movdqa %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSSE3-NEXT: pand %xmm12, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 ; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm10, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2] -; SSSE3-NEXT: pand %xmm5, %xmm8 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm10, %xmm6 -; SSSE3-NEXT: pandn %xmm6, %xmm8 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm11, %xmm5 -; SSSE3-NEXT: pand %xmm12, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: pandn %xmm2, %xmm8 -; SSSE3-NEXT: por %xmm4, %xmm8 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: psubq %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm7 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm5 +; 
SSSE3-NEXT: pand %xmm11, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm4, %xmm6 +; SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm10, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: psubq %xmm7, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm9, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm10, %xmm6 -; SSSE3-NEXT: pandn %xmm6, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pandn %xmm11, %xmm2 -; SSSE3-NEXT: pand %xmm12, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm2 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: pand %xmm11, %xmm6 +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm5, %xmm6 ; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: retq ; @@ -2612,293 +2031,198 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm11 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm4, %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 +; SSE41-NEXT: movdqa %xmm0, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm12 +; SSE41-NEXT: por %xmm0, %xmm12 ; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: movdqa %xmm9, 
%xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] -; SSE41-NEXT: pand %xmm10, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3] -; SSE41-NEXT: por %xmm12, %xmm15 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE41-NEXT: pxor %xmm10, %xmm15 -; SSE41-NEXT: pxor %xmm9, %xmm11 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSE41-NEXT: pand %xmm12, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE41-NEXT: por %xmm11, %xmm12 -; SSE41-NEXT: pxor %xmm10, %xmm12 -; SSE41-NEXT: pcmpeqq %xmm12, %xmm15 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm12, %xmm4 ; SSE41-NEXT: movdqa %xmm9, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm14 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3] -; SSE41-NEXT: por %xmm14, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pcmpeqq %xmm12, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pandn %xmm4, %xmm15 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm15, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: movdqa %xmm9, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm10, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm12 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm13 +; SSE41-NEXT: por %xmm0, %xmm13 ; SSE41-NEXT: pxor %xmm9, %xmm5 -; SSE41-NEXT: movdqa %xmm9, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = 
xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm14, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm13, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm12, %xmm13 ; SSE41-NEXT: movdqa %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpgtd %xmm12, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm14 -; SSE41-NEXT: pxor %xmm10, %xmm14 -; SSE41-NEXT: pcmpeqq %xmm14, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm15 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3] -; SSE41-NEXT: por %xmm15, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm14, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: pandn %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm6, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm12 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm9, %xmm6 -; SSE41-NEXT: movdqa %xmm9, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm12, %xmm5 ; SSE41-NEXT: movdqa %xmm9, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm14 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE41-NEXT: por %xmm14, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm6 -; SSE41-NEXT: pcmpeqq 
%xmm5, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm6 -; SSE41-NEXT: pandn %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm7, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 ; SSE41-NEXT: pxor %xmm9, %xmm7 -; SSE41-NEXT: movdqa %xmm9, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm9, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm13, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm6 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm6 -; SSE41-NEXT: pandn %xmm6, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 -; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm10, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm10, %xmm6 
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm12 -; AVX1-NEXT: vpcmpeqq %xmm8, %xmm12, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm10, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm10, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm11, %xmm5, %xmm11 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 -; AVX1-NEXT: vpsubq %xmm9, %xmm7, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm10, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpeqq %xmm6, %xmm12, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm11 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm10, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpeqq %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 -; AVX1-NEXT: vandnpd %ymm5, %ymm8, %ymm5 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm7 -; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vblendvpd %ymm7, %ymm8, %ymm11, %ymm7 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm5, %ymm7, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm10, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqq %xmm7, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm10, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm12 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm10, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqq %xmm12, %xmm7, %xmm12 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm12 -; AVX1-NEXT: vpcmpgtq %xmm12, %xmm10, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm10, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpeqq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vandnpd %ymm2, %ymm9, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-NEXT: vblendvpd %ymm3, %ymm8, %ymm11, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vpsubq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-NEXT: vxorpd %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] +; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm6, %ymm7 +; AVX1-NEXT: vblendvpd %ymm0, %ymm7, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpsubq %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm7 +; 
AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-NEXT: vxorpd %ymm1, %ymm5, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm6, %ymm3 +; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqq %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm8 -; AVX2-NEXT: vpcmpeqq %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpandn %ymm7, %ymm5, %ymm5 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vblendvpd %ymm2, %ymm7, %ymm8, %ymm2 -; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqq %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm3 -; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4 -; AVX2-NEXT: vpcmpeqq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vblendvpd %ymm3, %ymm7, %ymm8, %ymm3 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm6, %ymm7 +; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm6, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %zmm2, %zmm1, %k0 -; AVX512-NEXT: vpcmpnltq %zmm2, %zmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpcmpnltq %zmm2, %zmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vpcmpgtq %zmm2, %zmm1, %k0 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtq %zmm1, 
%zmm2, %k2 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y) ret <8 x i64> %z Index: llvm/test/CodeGen/X86/vec_saddo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_saddo.ll +++ llvm/test/CodeGen/X86/vec_saddo.ll @@ -190,82 +190,45 @@ ; SSE2-LABEL: saddo_v3i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, 8(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movq %xmm1, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: saddo_v3i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: paddd %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, 8(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movq %xmm1, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, 8(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: saddo_v3i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE41-NEXT: pandn %xmm3, %xmm2 -; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi) +; SSE41-NEXT: movq %xmm1, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: saddo_v3i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd 
%xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq @@ -273,17 +236,10 @@ ; AVX2-LABEL: saddo_v3i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq @@ -291,13 +247,10 @@ ; AVX512-LABEL: saddo_v3i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) @@ -315,67 +268,40 @@ ; SSE-LABEL: saddo_v4i32: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm0, (%rdi) -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: saddo_v4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) 
; AVX1-NEXT: retq ; ; AVX2-LABEL: saddo_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: saddo_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -392,164 +318,118 @@ ; SSE2-LABEL: saddo_v6i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; 
SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 -; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm2 -; SSE2-NEXT: movq %xmm1, 16(%rcx) -; SSE2-NEXT: movdqa %xmm0, (%rcx) -; SSE2-NEXT: movq %xmm2, 16(%rdi) -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movq %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: movq %xmm5, 16(%rdi) +; SSE2-NEXT: movdqa %xmm6, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: saddo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movd %r9d, %xmm0 +; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: pxor 
%xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 -; SSSE3-NEXT: paddd %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm4 -; SSSE3-NEXT: pandn %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: paddd %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 ; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 -; SSSE3-NEXT: paddd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 -; SSSE3-NEXT: pandn %xmm6, %xmm2 -; SSSE3-NEXT: movq %xmm1, 16(%rcx) -; SSSE3-NEXT: movdqa %xmm0, (%rcx) -; SSSE3-NEXT: movq %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: paddd %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: movq %xmm2, 16(%rcx) +; SSSE3-NEXT: movdqa %xmm4, (%rcx) +; SSSE3-NEXT: movq %xmm5, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm6, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: saddo_v6i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: movd %esi, %xmm4 -; SSE41-NEXT: pinsrd $1, %edx, %xmm4 -; SSE41-NEXT: pinsrd $2, %ecx, %xmm4 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm4 -; SSE41-NEXT: movd %r9d, %xmm2 -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movd %esi, %xmm1 +; SSE41-NEXT: pinsrd $1, %edx, %xmm1 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm1 +; SSE41-NEXT: movd %r9d, %xmm0 ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 +; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: paddd %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm4 -; SSE41-NEXT: pandn %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: paddd %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; 
SSE41-NEXT: pxor %xmm5, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE41-NEXT: pandn %xmm6, %xmm3 -; SSE41-NEXT: movq %xmm0, 16(%rcx) -; SSE41-NEXT: movdqa %xmm1, (%rcx) -; SSE41-NEXT: movq %xmm3, 16(%rdi) -; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm1, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movq %xmm2, 16(%rcx) +; SSE41-NEXT: movdqa %xmm4, (%rcx) +; SSE41-NEXT: movq %xmm0, 16(%rdi) +; SSE41-NEXT: movdqa %xmm6, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: saddo_v6i32: @@ -557,28 +437,15 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm8, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 ; AVX1-NEXT: vmovq %xmm2, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq @@ -586,17 +453,10 @@ ; AVX2-LABEL: saddo_v6i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vmovq %xmm2, 16(%rdi) ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) @@ -605,13 +465,10 @@ ; AVX512-LABEL: saddo_v6i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: 
vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -629,37 +486,18 @@ define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { ; SSE-LABEL: saddo_v8i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE-NEXT: pxor %xmm5, %xmm6 -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE-NEXT: pxor %xmm5, %xmm7 -; SSE-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm5, %xmm2 -; SSE-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE-NEXT: pxor %xmm5, %xmm7 -; SSE-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE-NEXT: paddd %xmm3, %xmm4 -; SSE-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm1 -; SSE-NEXT: pcmpeqd %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm4, 16(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE-NEXT: paddd %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm5, %xmm0 +; SSE-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm3, 16(%rdi) +; SSE-NEXT: movdqa %xmm2, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: saddo_v8i32: @@ -667,28 +505,15 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm8, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 ; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq @@ -696,30 +521,20 @@ ; AVX2-LABEL: saddo_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm3, 
%ymm5, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa %ymm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: saddo_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %ymm1, (%rdi) @@ -735,132 +550,70 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { ; SSE-LABEL: saddo_v16i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pxor %xmm11, %xmm11 -; SSE-NEXT: pcmpgtd %xmm4, %xmm11 -; SSE-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE-NEXT: pxor %xmm10, %xmm11 -; SSE-NEXT: pxor %xmm12, %xmm12 -; SSE-NEXT: pcmpgtd %xmm0, %xmm12 -; SSE-NEXT: pxor %xmm10, %xmm12 -; SSE-NEXT: pcmpeqd %xmm12, %xmm11 -; SSE-NEXT: paddd %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm8, %xmm8 ; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE-NEXT: pxor %xmm10, %xmm9 -; SSE-NEXT: pcmpeqd %xmm12, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: pxor %xmm12, %xmm12 -; SSE-NEXT: pcmpgtd %xmm5, %xmm12 -; SSE-NEXT: pxor %xmm10, %xmm12 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE-NEXT: pxor %xmm10, %xmm4 -; SSE-NEXT: pcmpeqd %xmm4, %xmm12 -; SSE-NEXT: paddd %xmm5, %xmm1 -; SSE-NEXT: pxor %xmm11, %xmm11 -; SSE-NEXT: pcmpgtd %xmm1, %xmm11 -; SSE-NEXT: pxor %xmm10, %xmm11 -; SSE-NEXT: pcmpeqd %xmm4, %xmm11 -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE-NEXT: pxor %xmm10, %xmm4 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm10, %xmm5 -; SSE-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE-NEXT: paddd %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm10, %xmm6 -; SSE-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE-NEXT: pxor %xmm10, %xmm4 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE-NEXT: pxor %xmm10, %xmm5 -; SSE-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE-NEXT: paddd %xmm7, %xmm8 -; SSE-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE-NEXT: pxor %xmm10, %xmm3 -; SSE-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm8, 48(%rdi) -; SSE-NEXT: movdqa %xmm2, 32(%rdi) -; SSE-NEXT: movdqa %xmm1, 16(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE-NEXT: paddd %xmm0, %xmm4 +; SSE-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE-NEXT: paddd %xmm1, %xmm5 +; SSE-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm9, %xmm1 +; SSE-NEXT: pxor 
%xmm9, %xmm9 +; SSE-NEXT: pcmpgtd %xmm6, %xmm9 +; SSE-NEXT: paddd %xmm2, %xmm6 +; SSE-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm9, %xmm2 +; SSE-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE-NEXT: paddd %xmm3, %xmm7 +; SSE-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm7, 48(%rdi) +; SSE-NEXT: movdqa %xmm6, 32(%rdi) +; SSE-NEXT: movdqa %xmm5, 16(%rdi) +; SSE-NEXT: movdqa %xmm4, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: saddo_v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm9, %xmm5, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm4, %xmm8 -; AVX1-NEXT: vpaddd %xmm9, %xmm7, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm9, %xmm5, %xmm7 -; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm8, %xmm4, %xmm8 +; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7 -; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm7 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm10 -; AVX1-NEXT: vpcmpgtd %xmm10, %xmm5, %xmm1 -; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpandn %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm7 -; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm3 -; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm3, %xmm7 -; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm1 -; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpandn %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm3 -; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm7 -; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm0 -; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, 
%ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vmovdqa %xmm9, 48(%rdi) -; AVX1-NEXT: vmovdqa %xmm10, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; @@ -868,28 +621,15 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpandn %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm4 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm0 -; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm0 -; AVX2-NEXT: vpandn %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 @@ -903,13 +643,10 @@ ; AVX512-LABEL: saddo_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k0 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq @@ -1155,185 +892,62 @@ } define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { -; SSE2-LABEL: saddo_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; 
SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, (%rdi) -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: saddo_v2i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: paddq %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: saddo_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = 
xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 -; SSE41-NEXT: pandn %xmm4, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: saddo_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: paddq %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: saddo_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: saddo_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: saddo_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 -; AVX512-NEXT: kxorw %k2, 
%k1, %k1 -; AVX512-NEXT: kandnw %k1, %k0, %k1 -; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k0 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vec_ssubo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_ssubo.ll +++ llvm/test/CodeGen/X86/vec_ssubo.ll @@ -189,87 +189,49 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { ; SSE2-LABEL: ssubo_v3i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, 8(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movq %xmm3, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: ssubo_v3i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSSE3-NEXT: psubd %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, 8(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: movq %xmm3, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, 8(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v3i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: pandn %xmm3, %xmm2 -; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psubd %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; 
SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pextrd $2, %xmm3, 8(%rdi) +; SSE41-NEXT: movq %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: ssubo_v3i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq @@ -277,18 +239,10 @@ ; AVX2-LABEL: ssubo_v3i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq @@ -296,13 +250,10 @@ ; AVX512-LABEL: ssubo_v3i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) @@ -319,71 +270,42 @@ define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { ; SSE-LABEL: ssubo_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE-NEXT: psubd %xmm1, %xmm0 -; SSE-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm0, (%rdi) -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psubd %xmm1, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm3, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: ssubo_v4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm0, 
%xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: ssubo_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -400,201 +322,132 @@ ; SSE2-LABEL: ssubo_v6i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE2-NEXT: movd %r9d, %xmm1 -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm4 
= mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE2-NEXT: psubd %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE2-NEXT: psubd %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: movq %xmm1, 16(%rcx) -; SSE2-NEXT: movdqa %xmm0, (%rcx) -; SSE2-NEXT: movq %xmm6, 16(%rdi) -; SSE2-NEXT: movdqa %xmm2, (%rdi) +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psubd %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psubd %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movq %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: movq %xmm2, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: ssubo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSSE3-NEXT: movd %r9d, %xmm1 -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq 
{{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 -; SSSE3-NEXT: psubd %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 -; SSSE3-NEXT: psubd %xmm4, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pxor %xmm5, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm3, %xmm6 -; SSSE3-NEXT: movq %xmm1, 16(%rcx) -; SSSE3-NEXT: movdqa %xmm0, (%rcx) -; SSSE3-NEXT: movq %xmm6, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm2, (%rdi) +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: psubd %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: psubd %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: movq %xmm3, 16(%rcx) +; SSSE3-NEXT: movdqa %xmm4, (%rcx) +; SSSE3-NEXT: movq %xmm2, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v6i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: pinsrd $1, %edx, %xmm0 -; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 -; SSE41-NEXT: movd %r9d, %xmm1 -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: movd %esi, %xmm1 +; SSE41-NEXT: pinsrd $1, %edx, %xmm1 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm1 +; SSE41-NEXT: movd %r9d, %xmm0 +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm6 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE41-NEXT: pxor %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE41-NEXT: psubd %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: pandn %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, 
%xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: psubd %xmm3, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: pandn %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psubd %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movq %xmm1, 16(%rcx) -; SSE41-NEXT: movdqa %xmm0, (%rcx) -; SSE41-NEXT: movq %xmm6, 16(%rdi) -; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: movdqa %xmm4, (%rcx) +; SSE41-NEXT: movq %xmm0, 16(%rdi) +; SSE41-NEXT: movdqa %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: ssubo_v6i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 ; AVX1-NEXT: vmovq %xmm2, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq @@ -602,18 +455,10 @@ ; AVX2-LABEL: ssubo_v6i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vmovq %xmm2, 16(%rdi) ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) @@ 
-622,13 +467,10 @@ ; AVX512-LABEL: ssubo_v6i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -646,70 +488,35 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { ; SSE-LABEL: ssubo_v8i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE-NEXT: pxor %xmm6, %xmm4 -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE-NEXT: pxor %xmm6, %xmm7 -; SSE-NEXT: pcmpeqd %xmm7, %xmm4 -; SSE-NEXT: psubd %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE-NEXT: pxor %xmm6, %xmm7 -; SSE-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE-NEXT: psubd %xmm3, %xmm1 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm6, %xmm5 -; SSE-NEXT: pcmpeqd %xmm7, %xmm5 -; SSE-NEXT: pxor %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm1, 16(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psubd %xmm2, %xmm5 +; SSE-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psubd %xmm3, %xmm2 +; SSE-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, 16(%rdi) +; SSE-NEXT: movdqa %xmm5, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: ssubo_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm5, 
%xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 ; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq @@ -717,31 +524,20 @@ ; AVX2-LABEL: ssubo_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa %ymm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: ssubo_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %ymm1, (%rdi) @@ -757,128 +553,59 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { ; SSE-LABEL: ssubo_v16i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE-NEXT: pcmpeqd %xmm11, %xmm11 -; SSE-NEXT: pxor %xmm11, %xmm8 ; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE-NEXT: pxor %xmm11, %xmm9 -; SSE-NEXT: pcmpeqd %xmm9, %xmm8 -; SSE-NEXT: psubd %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE-NEXT: pxor %xmm11, %xmm4 -; SSE-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE-NEXT: pxor %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE-NEXT: pxor %xmm11, %xmm9 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE-NEXT: pxor %xmm11, %xmm4 -; SSE-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE-NEXT: psubd %xmm5, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm11, %xmm5 -; SSE-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE-NEXT: pxor %xmm11, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE-NEXT: pxor %xmm11, %xmm4 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm11, %xmm5 -; SSE-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE-NEXT: psubd %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm11, %xmm6 -; SSE-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE-NEXT: pxor %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE-NEXT: pxor %xmm11, %xmm5 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: 
pcmpgtd %xmm3, %xmm6 -; SSE-NEXT: pxor %xmm11, %xmm6 -; SSE-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE-NEXT: psubd %xmm7, %xmm3 -; SSE-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE-NEXT: pxor %xmm11, %xmm10 -; SSE-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE-NEXT: pxor %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm3, 48(%rdi) -; SSE-NEXT: movdqa %xmm2, 32(%rdi) -; SSE-NEXT: movdqa %xmm1, 16(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: psubd %xmm4, %xmm8 +; SSE-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psubd %xmm5, %xmm4 +; SSE-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: psubd %xmm6, %xmm5 +; SSE-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: psubd %xmm7, %xmm6 +; SSE-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm6, 48(%rdi) +; SSE-NEXT: movdqa %xmm5, 32(%rdi) +; SSE-NEXT: movdqa %xmm4, 16(%rdi) +; SSE-NEXT: movdqa %xmm8, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: ssubo_v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm9, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm10 -; AVX1-NEXT: vpcmpgtd %xmm10, %xmm9, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpandn %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm9, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm9, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm7 +; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm9, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm9, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm9, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm9, %xmm6 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm6, 
%xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm9, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 @@ -888,40 +615,25 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-NEXT: vmovdqa %xmm10, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm7, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpandn %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7 -; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm4 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm0 -; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm0 -; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 @@ -935,13 +647,10 @@ ; AVX512-LABEL: ssubo_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 -; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; 
AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq @@ -1187,193 +896,62 @@ } define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { -; SSE2-LABEL: ssubo_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm0, (%rdi) -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: ssubo_v2i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psubq %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ssubo_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm6, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pandn %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ssubo_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: psubq %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ssubo_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
ssubo_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: ssubo_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 -; AVX512-NEXT: kxorw %k0, %k1, %k0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq
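
Note on the updated vec_ssubo.ll sequences above: every target now materializes the overflow mask with two compares and an xor (a pcmpgtd/pcmpgtq of the RHS against zero, a pcmpgtd/pcmpgtq of the LHS against the wrapped difference, then pxor/kxorw), replacing the old pattern of three sign compares against zero combined with pcmpeqd/pandn. A minimal scalar sketch of the condition those CHECK lines compute; the function name and the i32 element width here are illustrative only and are not taken from the patch:

    #include <cstdint>

    // Illustrative scalar form of the new overflow test: for X - Y, overflow
    // occurs exactly when (Y > 0) differs from (wrapped difference < X).
    static bool ssubOverflows(int32_t X, int32_t Y) {
      // Wrapping subtract via unsigned arithmetic to avoid signed-overflow UB.
      int32_t Res = static_cast<int32_t>(static_cast<uint32_t>(X) -
                                         static_cast<uint32_t>(Y));
      return (Y > 0) != (Res < X);
    }

For example, ssubOverflows(0, INT32_MIN) reports overflow (Y is not positive but the wrapped result INT32_MIN is below X), while ssubOverflows(-1, INT32_MIN) does not, matching the single pcmpgtd-against-zero plus pcmpgtd-against-result pair seen in each rewritten hunk.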