diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15985,7 +15985,9 @@
   if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
       Value.getNode()->hasOneUse() && ST->isUnindexed() &&
       Subtarget->useSVEForFixedLengthVectors() &&
-      Value.getValueType().isFixedLengthVector())
+      Value.getValueType().isFixedLengthVector() &&
+      Value.getValueType().getFixedSizeInBits() >
+          Subtarget->getMinSVEVectorSizeInBits())
     return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                              ST->getMemoryVT(), ST->getMemOperand());
 
@@ -17346,7 +17348,8 @@
   // they can be split down into something legal.
   if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
       N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
-      VT.isFixedLengthVector()) {
+      VT.isFixedLengthVector() &&
+      VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(), LN0->getBasePtr(),
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -25,24 +25,31 @@
 
 ;
 ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fcvt_v2f16_v2f32(<2 x half> %op1) #0 {
+define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
 ; CHECK-LABEL: fcvt_v2f16_v2f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <2 x half>, <2 x half>* %a
   %res = fpext <2 x half> %op1 to <2 x float>
-  ret <2 x float> %res
+  store <2 x float> %res, <2 x float>* %b
+  ret void
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fcvt_v4f16_v4f32(<4 x half> %op1) #0 {
+define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
 ; CHECK-LABEL: fcvt_v4f16_v4f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <4 x half>, <4 x half>* %a
   %res = fpext <4 x half> %op1 to <4 x float>
-  ret <4 x float> %res
+  store <4 x float> %res, <4 x float>* %b
+  ret void
 }
 
 define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
@@ -120,28 +127,34 @@
 
 ;
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fcvt_v1f16_v1f64(<1 x half> %op1) #0 {
+define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
 ; CHECK-LABEL: fcvt_v1f16_v1f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    fcvt d0, h0
+; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <1 x half>, <1 x half>* %a
   %res = fpext <1 x half> %op1 to <1 x double>
-  ret <1 x double> %res
+  store <1 x double> %res, <1 x double>* %b
+  ret void
 }
 
 ; v2f16 is not legal for NEON, so use SVE
-define <2 x double> @fcvt_v2f16_v2f64(<2 x half> %op1) #0 {
+define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
 ; CHECK-LABEL: fcvt_v2f16_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <2 x half>, <2 x half>* %a
   %res = fpext <2 x half> %op1 to <2 x double>
-  ret <2 x double> %res
+  store <2 x double> %res, <2 x double>* %b
+  ret void
 }
 
 define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
@@ -218,24 +231,31 @@
 
 ;
 ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fcvt_v1f32_v1f64(<1 x float> %op1) #0 {
+define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
 ; CHECK-LABEL: fcvt_v1f32_v1f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <1 x float>, <1 x float>* %a
   %res = fpext <1 x float> %op1 to <1 x double>
-  ret <1 x double> %res
+  store <1 x double> %res, <1 x double>* %b
+  ret void
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fcvt_v2f32_v2f64(<2 x float> %op1) #0 {
+define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
 ; CHECK-LABEL: fcvt_v2f32_v2f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <2 x float>, <2 x float>* %a
   %res = fpext <2 x float> %op1 to <2 x double>
-  ret <2 x double> %res
+  store <2 x double> %res, <2 x double>* %b
+  ret void
 }
 
 define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
@@ -273,7 +293,6 @@
 ; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
-
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fpext <8 x float> %op1 to <8 x double>
   store <8 x double> %res, <8 x double>* %b
@@ -313,39 +332,45 @@
 
 ;
 ; Don't use SVE for 64-bit vectors.
-define <2 x half> @fcvt_v2f32_v2f16(<2 x float> %op1) #0 {
+define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
 ; CHECK-LABEL: fcvt_v2f32_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    str s0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <2 x float>, <2 x float>* %a
   %res = fptrunc <2 x float> %op1 to <2 x half>
-  ret <2 x half> %res
+  store <2 x half> %res, <2 x half>* %b
+  ret void
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <4 x half> @fcvt_v4f32_v4f16(<4 x float> %op1) #0 {
+define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
 ; CHECK-LABEL: fcvt_v4f32_v4f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <4 x float>, <4 x float>* %a
   %res = fptrunc <4 x float> %op1 to <4 x half>
-  ret <4 x half> %res
+  store <4 x half> %res, <4 x half>* %b
+  ret void
 }
 
-define <8 x half> @fcvt_v8f32_v8f16(<8 x float>* %a) #0 {
+define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
 ; CHECK-LABEL: fcvt_v8f32_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fptrunc <8 x float> %op1 to <8 x half>
-  ret <8 x half> %res
+  store <8 x half> %res, <8 x half>* %b
+  ret void
 }
 
 define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
@@ -356,10 +381,15 @@
 ; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ptrue p0.s
 ; VBITS_EQ_256-NEXT:    fcvt z0.h, p0/m, z0.s
 ; VBITS_EQ_256-NEXT:    fcvt z1.h, p0/m, z1.s
-; VBITS_EQ_256-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
-; VBITS_EQ_256-NEXT:    st1h { z1.s }, p0, [x1]
+; VBITS_EQ_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_EQ_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl8
+; VBITS_EQ_256-NEXT:    splice z1.h, p0, z1.h, z0.h
+; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
@@ -408,47 +438,51 @@
 
 ;
 ; Don't use SVE for 64-bit vectors.
-define <1 x half> @fcvt_v1f64_v1f16(<1 x double> %op1) #0 {
+define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
 ; CHECK-LABEL: fcvt_v1f64_v1f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    fcvt h0, d0
+; CHECK-NEXT:    str h0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <1 x double>, <1 x double>* %a
   %res = fptrunc <1 x double> %op1 to <1 x half>
-  ret <1 x half> %res
+  store <1 x half> %res, <1 x half>* %b
+  ret void
 }
 
 ; v2f16 is not legal for NEON, so use SVE
-define <2 x half> @fcvt_v2f64_v2f16(<2 x double> %op1) #0 {
+define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
 ; CHECK-LABEL: fcvt_v2f64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str s0, [x1]
 ; CHECK-NEXT:    ret
+  %op1 = load <2 x double>, <2 x double>* %a
   %res = fptrunc <2 x double> %op1 to <2 x half>
-  ret <2 x half> %res
+  store <2 x half> %res, <2 x half>* %b
+  ret void
 }
 
-define <4 x half> @fcvt_v4f64_v4f16(<4 x double>* %a) #0 {
+define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
 ; CHECK-LABEL: fcvt_v4f64_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %res = fptrunc <4 x double> %op1 to <4 x half>
-  ret <4 x half> %res
+  store <4 x half> %res, <4 x half>* %b
+  ret void
 }
 
-define <8 x half> @fcvt_v8f64_v8f16(<8 x double>* %a) #0 {
+define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
 ; Ensure sensible type legalisation
 ; VBITS_EQ_256-LABEL: fcvt_v8f64_v8f16:
 ; VBITS_EQ_256:       // %bb.0:
@@ -461,25 +495,23 @@
 ; VBITS_EQ_256-NEXT:    fcvt z1.h, p0/m, z1.d
 ; VBITS_EQ_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_EQ_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT:    uzp1 z2.h, z0.h, z0.h
-; VBITS_EQ_256-NEXT:    uzp1 z0.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT:    mov v0.d[1], v2.d[0]
-; VBITS_EQ_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_EQ_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_EQ_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_EQ_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_EQ_256-NEXT:    str q1, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v8f64_v8f16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ptrue p0.d
 ; VBITS_GE_512-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptrunc <8 x double> %op1 to <8 x half>
-  ret <8 x half> %res
+  store <8 x half> %res, <8 x half>* %b
+  ret void
 }
 
 define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
@@ -515,39 +547,42 @@
 
 ;
 ; Don't use SVE for 64-bit vectors.
-define <1 x float> @fcvt_v1f64_v1f32(<1 x double> %op1) #0 {
+define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
 ; CHECK-LABEL: fcvt_v1f64_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
   %res = fptrunc <1 x double> %op1 to <1 x float>
-  ret <1 x float> %res
+  store <1 x float> %res, <1 x float>* %b
+  ret void
 }
 
 ; Don't use SVE for 128-bit vectors.
-define <2 x float> @fcvt_v2f64_v2f32(<2 x double> %op1) #0 {
+define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
 ; CHECK-LABEL: fcvt_v2f64_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %res = fptrunc <2 x double> %op1 to <2 x float>
-  ret <2 x float> %res
+  store <2 x float> %res, <2 x float>* %b
+  ret void
 }
 
-define <4 x float> @fcvt_v4f64_v4f32(<4 x double>* %a) #0 {
+define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
 ; CHECK-LABEL: fcvt_v4f64_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, <4 x double>* %a
   %res = fptrunc <4 x double> %op1 to <4 x float>
-  ret <4 x float> %res
+  store <4 x float> %res, <4 x float>* %b
+  ret void
 }
 
 define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
@@ -558,10 +593,15 @@
 ; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_EQ_256-NEXT:    ptrue p0.d
 ; VBITS_EQ_256-NEXT:    fcvt z0.s, p0/m, z0.d
 ; VBITS_EQ_256-NEXT:    fcvt z1.s, p0/m, z1.d
-; VBITS_EQ_256-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
-; VBITS_EQ_256-NEXT:    st1w { z1.d }, p0, [x1]
+; VBITS_EQ_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_EQ_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl4
+; VBITS_EQ_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:
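
Note on the intent of the AArch64ISelLowering.cpp hunks above: both DAG combines (a truncating FP_ROUND feeding a store, and a fixed-length extend of a normal load) are now gated so they only fire for fixed-length vectors wider than the minimum SVE register size, i.e. the types that legalization would otherwise have to split; vectors that already fit in a NEON or single SVE register keep their existing lowering, as the updated CHECK lines show. Below is a minimal, self-contained sketch of the new guard. MockVT and MockSubtarget are invented stand-ins for llvm::EVT and llvm::AArch64Subtarget (not real LLVM API); only the final comparison mirrors the patch.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for llvm::EVT, reduced to the two queries the guard uses.
struct MockVT {
  bool IsFixedLengthVector;
  uint64_t SizeInBits;
  bool isFixedLengthVector() const { return IsFixedLengthVector; }
  uint64_t getFixedSizeInBits() const { return SizeInBits; }
};

// Hypothetical stand-in for llvm::AArch64Subtarget.
struct MockSubtarget {
  uint64_t MinSVEBits; // e.g. 256 for -aarch64-sve-vector-bits-min=256
  // Simplified assumption: SVE fixed-length lowering is used from 256-bit minimums.
  bool useSVEForFixedLengthVectors() const { return MinSVEBits >= 256; }
  uint64_t getMinSVEVectorSizeInBits() const { return MinSVEBits; }
};

// The predicate both hunks now apply before forming the extending-load /
// truncating-store combine: only vectors wider than one SVE register qualify.
bool combineOnlyWhenSplittingIsNeeded(const MockVT &VT, const MockSubtarget &ST) {
  return ST.useSVEForFixedLengthVectors() && VT.isFixedLengthVector() &&
         VT.getFixedSizeInBits() > ST.getMinSVEVectorSizeInBits();
}

int main() {
  MockSubtarget ST{256};
  // <16 x float> = 512 bits: wider than one 256-bit register, combine fires.
  std::cout << combineOnlyWhenSplittingIsNeeded({true, 512}, ST) << '\n'; // 1
  // <8 x float> = 256 bits: fits in one register, normal lowering is left alone.
  std::cout << combineOnlyWhenSplittingIsNeeded({true, 256}, ST) << '\n'; // 0
  // <2 x float> = 64 bits: NEON-sized, untouched as the tests expect.
  std::cout << combineOnlyWhenSplittingIsNeeded({true, 64}, ST) << '\n';  // 0
}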