diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21246,7 +21246,7 @@ auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); - if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { + if (VT.isFloatingPoint()) { LoadVT = ContainerVT.changeTypeToInteger(); MemVT = MemVT.changeTypeToInteger(); } @@ -21264,6 +21264,8 @@ Result = getSVESafeBitCast(ExtendVT, Result, DAG); Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, Pg, Result, DAG.getUNDEF(ContainerVT)); + } else if (VT.isFloatingPoint()) { + Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result); } Result = convertFromScalableVector(DAG, VT, Result); @@ -21354,6 +21356,10 @@ DAG.getUNDEF(TruncVT)); NewValue = getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG); + } else if (VT.isFloatingPoint()) { + MemVT = MemVT.changeTypeToInteger(); + NewValue = + getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG); } return DAG.getMaskedStore(Store->getChain(), DL, NewValue, diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -161,10 +161,8 @@ ; ; VBITS_GE_512-LABEL: fcvtzu_v16f16_v16i32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcvtzu z0.s, p0/m, z0.h ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret @@ -177,10 +175,8 @@ define void @fcvtzu_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: fcvtzu_v32f16_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret @@ -193,10 +189,8 @@ define void @fcvtzu_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: fcvtzu_v64f16_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret @@ -287,11 +281,8 @@ define void @fcvtzu_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: fcvtzu_v16f16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -304,11 +295,8 @@ define void @fcvtzu_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: fcvtzu_v32f16_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -579,10 +567,8 @@ ; ; VBITS_GE_512-LABEL: fcvtzu_v8f32_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.s ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret @@ -595,10 +581,8 @@ define void @fcvtzu_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: fcvtzu_v16f32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -611,10 +595,8 @@ define void @fcvtzu_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: fcvtzu_v32f32_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -1087,10 +1069,8 @@ ; ; VBITS_GE_512-LABEL: fcvtzs_v16f16_v16i32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcvtzs z0.s, p0/m, z0.h ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret @@ -1103,10 +1083,8 @@ define void @fcvtzs_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: fcvtzs_v32f16_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret @@ -1119,10 +1097,8 @@ define void @fcvtzs_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: fcvtzs_v64f16_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret @@ -1213,11 +1189,8 @@ define void @fcvtzs_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: fcvtzs_v16f16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -1230,11 +1203,8 @@ define void @fcvtzs_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: fcvtzs_v32f16_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -1505,10 +1475,8 @@ ; ; VBITS_GE_512-LABEL: fcvtzs_v8f32_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.s ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret @@ -1521,10 +1489,8 @@ define void @fcvtzs_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range(8,0) #0 { ; CHECK-LABEL: fcvtzs_v16f32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret @@ -1537,10 +1503,8 @@ define void @fcvtzs_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) vscale_range(16,0) #0 { ; CHECK-LABEL: fcvtzs_v32f32_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -696,10 +696,9 @@ ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d -; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i64>, <16 x i64>* %a %res = uitofp <16 x i64> %op1 to <16 x half> @@ -714,10 +713,9 @@ ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d -; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x i64>, <32 x i64>* %a %res = uitofp <32 x i64> %op1 to <32 x half> @@ -1638,10 +1636,9 @@ ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d -; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i64>, <16 x i64>* %a %res = sitofp <16 x i64> %op1 to <16 x half> @@ -1656,10 +1653,9 @@ ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d -; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x i64>, <32 x i64>* %a %res = sitofp <32 x i64> %op1 to <32 x half> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -677,13 +677,11 @@ ; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1h { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1h { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <16 x half>, <16 x half>* %a %ptrs = load <16 x half*>, <16 x half*>* %b @@ -700,13 +698,11 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1h { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1h { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <32 x half>, <32 x half>* %a %ptrs = load <32 x half*>, <32 x half*>* %b @@ -793,11 +789,10 @@ ; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_512-NEXT: punpklo p1.h, p1.b -; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: st1w { z0.d }, p1, [x0] ; VBITS_GE_512-NEXT: ret %cval = load <8 x float>, <8 x float>* %a %ptrs = load <8 x float*>, <8 x float*>* %b @@ -814,11 +809,10 @@ ; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <16 x float>, <16 x float>* %a %ptrs = load <16 x float*>, <16 x float*>* %b @@ -835,11 +829,10 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cval = load <32 x float>, <32 x float>* %a %ptrs = load <32 x float*>, <32 x float*>* %b @@ -993,11 +986,10 @@ ; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, sxtw #1] +; CHECK-NEXT: st1h { z0.s }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -1056,11 +1048,10 @@ ; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, uxtw #1] +; CHECK-NEXT: st1h { z0.s }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -1079,11 +1070,10 @@ ; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, sxtw] +; CHECK-NEXT: st1h { z0.s }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -1103,11 +1093,10 @@ ; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, uxtw] +; CHECK-NEXT: st1h { z0.s }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -1127,11 +1116,10 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z1.d, lsl #2] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i64>, <32 x i64>* %b @@ -1149,11 +1137,10 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z1.d] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i64>, <32 x i64>* %b @@ -1172,11 +1159,10 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z1.d] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %bases = load <32 x i8*>, <32 x i8*>* %b @@ -1195,11 +1181,10 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d, #4] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d, #4] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %bases = load <32 x i8*>, <32 x i8*>* %b @@ -1242,11 +1227,10 @@ ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: st1w { z0.d }, p1, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %ptrs = load <32 x float*>, <32 x float*>* %b