diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15546,6 +15546,27 @@
   return SDValue();
 }
 
+static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+         "Unexpected opcode!");
+
+  SDValue Pred = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
+
+  // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
+  //   => inner setcc_merge_zero
+  if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
+      LHS->getOpcode() == ISD::SIGN_EXTEND &&
+      LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
+      LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+      LHS->getOperand(0)->getOperand(0) == Pred)
+    return LHS->getOperand(0);
+
+  return SDValue();
+}
+
 // Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
 // as well as whether the test should be inverted.  This code is required to
 // catch these cases (as opposed to standard dag combines) because
@@ -16406,6 +16427,8 @@
     return performSpliceCombine(N, DAG);
   case AArch64ISD::UZP1:
     return performUzpCombine(N, DAG);
+  case AArch64ISD::SETCC_MERGE_ZERO:
+    return performSetccMergeZeroCombine(N, DAG);
   case AArch64ISD::GLD1_MERGE_ZERO:
   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -90,8 +90,6 @@
 ; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; CHECK-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; CHECK-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; CHECK-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x0]
 ; CHECK-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; CHECK-NEXT: ret
@@ -108,8 +106,6 @@
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -126,8 +122,6 @@
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_1024-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_1024-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_1024-NEXT: ret
@@ -144,8 +138,6 @@
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_2048-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_2048-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_2048-NEXT: ret
@@ -163,8 +155,6 @@
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].b, [[PG0]]/z, [[Z0]].b, #0
 ; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -181,8 +171,6 @@
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].h, [[PG0]]/z, [[Z0]].h, #0
 ; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG0]], [x8]
 ; VBITS_GE_512: ret
@@ -199,8 +187,6 @@
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -217,8 +203,6 @@
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -235,8 +219,6 @@
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
 ; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
@@ -254,8 +236,6 @@
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
 ; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
@@ -273,12 +253,10 @@
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
-; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <32 x i8>, <32 x i8>* %ap
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -337,12 +315,10 @@
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
-; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x i16>, <16 x i16>* %ap
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -379,12 +355,10 @@
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
-; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
-; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i32>, <8 x i32>* %ap
   %b = load <8 x i32>, <8 x i32>* %bp
@@ -400,12 +374,10 @@
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
-; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <32 x i8>, <32 x i8>* %ap
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -464,12 +436,10 @@
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
-; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x i16>, <16 x i16>* %ap
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -506,12 +476,10 @@
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
-; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
-; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i32>, <8 x i32>* %ap
   %b = load <8 x i32>, <8 x i32>* %bp
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -91,9 +91,7 @@
 ; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; CHECK-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
-; CHECK-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z2]].s, #0
-; CHECK-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; CHECK-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
   %a = load <8 x float>, <8 x float>* %ap
   %b = load <8 x float>, <8 x float>* %bp
@@ -108,9 +106,7 @@
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x float>, <16 x float>* %ap
   %b = load <16 x float>, <16 x float>* %bp
@@ -125,9 +121,7 @@
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_1024-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_1024-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_1024-NEXT: ret
   %a = load <32 x float>, <32 x float>* %ap
   %b = load <32 x float>, <32 x float>* %bp
@@ -142,9 +136,7 @@
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_2048-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_2048-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_2048-NEXT: ret
   %a = load <64 x float>, <64 x float>* %ap
   %b = load <64 x float>, <64 x float>* %bp
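
Note on the pattern being folded: the removed "mov ..., #-1" / "cmpne ..., #0" pairs materialised a predicate as a vector only to immediately compare it against zero under the same governing predicate, i.e. a sign_extend of a SETCC_MERGE_ZERO feeding another SETCC_MERGE_ZERO, which the new combine replaces with the inner compare. A reduced form of the IR the masked-load tests lower is sketched below for illustration; the function name, intrinsic mangling, and attribute group are assumptions based on the test lines shown, not copied verbatim from the test file.

; Sketch only (assumed names/mangling): a compare result used as a masked-load mask.
; With the combine, the fcmeq predicate feeds the predicated ld1w directly.
define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
  %a = load <8 x float>, <8 x float>* %ap
  %b = load <8 x float>, <8 x float>* %bp
  %mask = fcmp oeq <8 x float> %a, %b
  %load = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %ap, i32 8, <8 x i1> %mask, <8 x float> undef)
  ret <8 x float> %load
}

declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32 immarg, <8 x i1>, <8 x float>)

attributes #0 = { "target-features"="+sve" }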