diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19105,6 +19105,12 @@
   SDValue Chain = ST->getChain();
   SDValue Value = ST->getValue();
   SDValue Ptr = ST->getBasePtr();
+  EVT ValueVT = Value.getValueType();
+
+  auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
+    EVT EltVT = VT.getVectorElementType();
+    return EltVT == MVT::f32 || EltVT == MVT::f64;
+  };
 
   // If this is an FP_ROUND followed by a store, fold this into a truncating
   // store. We can do this even if this is already a truncstore.
@@ -19113,9 +19119,9 @@
   if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
       Value.getNode()->hasOneUse() && ST->isUnindexed() &&
       Subtarget->useSVEForFixedLengthVectors() &&
-      Value.getValueType().isFixedLengthVector() &&
-      Value.getValueType().getFixedSizeInBits() >=
-          Subtarget->getMinSVEVectorSizeInBits())
+      ValueVT.isFixedLengthVector() &&
+      ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
+      hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
     return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                              ST->getMemoryVT(), ST->getMemOperand());
@@ -21021,12 +21027,17 @@
   if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
     return SDValue();
 
+  auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
+    EVT EltVT = VT.getVectorElementType();
+    return EltVT == MVT::f32 || EltVT == MVT::f64;
+  };
+
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
   // We purposefully don't care about legality of the nodes here as we know
   // they can be split down into something legal.
   if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
       N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
-      VT.isFixedLengthVector() &&
+      VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
       VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Ensure we don't attempt to combine into an extending fp128 load.
+define void @fcvt_v4f64_v4f128(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcvt_v4f64_v4f128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: add x8, sp, #48
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: bl __extenddftf2
+; CHECK-NEXT: add x8, sp, #48
+; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov d1, v1.d[1]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: bl __extenddftf2
+; CHECK-NEXT: add x8, sp, #48
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr z0, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: bl __extenddftf2
+; CHECK-NEXT: add x8, sp, #48
+; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov d1, v1.d[1]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: bl __extenddftf2
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: stp q1, q0, [x19]
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: stp q0, q2, [x19, #32]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %res = fpext <4 x double> %op1 to <4 x fp128>
+ store <4 x fp128> %res, ptr %b
+ ret void
+}
+
+; Ensure we don't attempt to combine into a truncating fp128 store.
+define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcvt_v4f128_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: ldr q1, [x0, #64]
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: ldr q0, [x0, #80]
+; CHECK-NEXT: stp q0, q1, [sp, #96] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q1, [x0, #96]
+; CHECK-NEXT: ldr q0, [x0, #112]
+; CHECK-NEXT: stp q0, q1, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: stp q0, q1, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q0, [x0, #32]
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [x0, #48]
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: add x8, sp, #128
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add x8, sp, #128
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: add x8, sp, #128
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: str q0, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: add x9, sp, #128
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x8, #4
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ldr z1, [x9] // 16-byte Folded Reload
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: st1d { z0.d }, p0, [x19, x8, lsl #3]
+; CHECK-NEXT: add x8, sp, #128
+; CHECK-NEXT: ldr z0, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %op1 = load <8 x fp128>, ptr %a
+ %res = fptrunc <8 x fp128> %op1 to <8 x double>
+ store <8 x double> %res, ptr %b
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sve" }
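
Note: the new hasValidElementTypeForFPTruncStore / hasValidElementTypeForFPExtLoad guards only admit f32 and f64 element types, because AArch64 has no fp128 register format and fp128 conversions are lowered to libcalls (the __extenddftf2 / __trunctfdf2 calls in the checks above) rather than to SVE extending loads or truncating stores. For contrast, here is a minimal illustrative sketch of IR that still qualifies for the truncating-store fold after this change; it is not part of the patch, the function name is made up, and it reuses the attribute group and vscale_range from the test above:

; Hypothetical example, not in this patch: the FP_ROUND source element type is
; f64, so hasValidElementTypeForFPTruncStore returns true and the combine may
; still fold the fptrunc + store into an SVE truncating store.
define void @fcvt_v8f64_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
  %op1 = load <8 x double>, ptr %a
  %res = fptrunc <8 x double> %op1 to <8 x float>
  store <8 x float> %res, ptr %b
  ret void
}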