diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1731,16 +1731,14 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl, SDValue Chain) { - unsigned SrcSize = SrcOp.getValueSizeInBits(); - unsigned SlotSize = SlotVT.getSizeInBits(); - unsigned DestSize = DestVT.getSizeInBits(); + EVT SrcVT = SrcOp.getValueType(); Type *DestType = DestVT.getTypeForEVT(*DAG.getContext()); Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType); // Don't convert with stack if the load/store is expensive. - if ((SrcSize > SlotSize && + if ((SrcVT.bitsGT(SlotVT) && !TLI.isTruncStoreLegalOrCustom(SrcOp.getValueType(), SlotVT)) || - (SlotSize < DestSize && + (SlotVT.bitsLT(DestVT) && !TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, DestVT, SlotVT))) return SDValue(); @@ -1758,20 +1756,19 @@ // later than DestVT. SDValue Store; - if (SrcSize > SlotSize) + if (SrcVT.bitsGT(SlotVT)) Store = DAG.getTruncStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SlotVT, SrcAlign); else { - assert(SrcSize == SlotSize && "Invalid store"); - Store = - DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign); + assert(SrcVT.bitsEq(SlotVT) && "Invalid store"); + Store = DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign); } // Result is a load from the stack slot. - if (SlotSize == DestSize) + if (SlotVT.bitsEq(DestVT)) return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign); - assert(SlotSize < DestSize && "Unknown extension!"); + assert(SlotVT.bitsLT(DestVT) && "Unknown extension!"); return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT, DestAlign); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3817,6 +3817,14 @@ return LowerFixedLengthBitcastToSVE(Op, DAG); if (OpVT.isScalableVector()) { + // Bitcasting between unpacked vector types of different element counts is + // not a NOP because the live elements are laid out differently. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? + if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) + return SDValue(); + if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && "Expected int->fp bitcast!"); @@ -19282,6 +19290,15 @@ if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) { assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() && "Expected fp->int bitcast!"); + + // Bitcasting between unpacked vector types of different element counts is + // not a NOP because the live elements are laid out differently. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? + if (VT.getVectorElementCount() != SrcVT.getVectorElementCount()) + return; + SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult)); return; @@ -21137,6 +21154,17 @@ EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); + // Safe bitcasting between unpacked vector types of different element counts + // is currently unsupported because the following is missing the necessary + // work to ensure the result's elements live where they're supposed to within + // an SVE register. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? + assert((VT.getVectorElementCount() == InVT.getVectorElementCount() || + VT == PackedVT || InVT == PackedInVT) && + "Unexpected bitcast!"); + // Pack input if required. if (InVT != PackedInVT) Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op); diff --git a/llvm/test/CodeGen/AArch64/sve-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-bitcast.ll --- a/llvm/test/CodeGen/AArch64/sve-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-bitcast.ll @@ -610,11 +610,17 @@ ret <vscale x 4 x i16> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 4 x i16> @bitcast_nxv2f32_to_nxv4i16(<vscale x 2 x float> %v) #0 { ; CHECK-LABEL: bitcast_nxv2f32_to_nxv4i16: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x i16> ret <vscale x 4 x i16> %bc @@ -664,11 +670,17 @@ ret <vscale x 2 x i32> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 2 x i32> @bitcast_nxv4f16_to_nxv2i32(<vscale x 4 x half> %v) #0 { ; CHECK-LABEL: bitcast_nxv4f16_to_nxv2i32: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 4 x half> %v to <vscale x 2 x i32> ret <vscale x 2 x i32> %bc @@ -682,11 +694,17 @@ ret <vscale x 2 x i32> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 2 x i32> @bitcast_nxv4bf16_to_nxv2i32(<vscale x 4 x bfloat> %v) #0 { ; CHECK-LABEL: bitcast_nxv4bf16_to_nxv2i32: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 2 x i32> ret <vscale x 2 x i32> %bc @@ -720,21 +738,33 @@ ret <vscale x 4 x half> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 4 x half> @bitcast_nxv2i32_to_nxv4f16(<vscale x 2 x i32> %v) #0 { ; CHECK-LABEL: bitcast_nxv2i32_to_nxv4f16: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x half> ret <vscale x 4 x half> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 4 x half> @bitcast_nxv2f32_to_nxv4f16(<vscale x 2 x float> %v) #0 { ; CHECK-LABEL: bitcast_nxv2f32_to_nxv4f16: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x half> ret <vscale x 4 x half> %bc @@ -768,11 +798,17 @@ ret <vscale x 2 x float> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 2 x float> @bitcast_nxv4i16_to_nxv2f32(<vscale x 4 x i16> %v) #0 { ; CHECK-LABEL: bitcast_nxv4i16_to_nxv2f32: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 4 x i16> %v to <vscale x 2 x float> ret <vscale x 2 x float> %bc @@ -786,21 +822,33 @@ ret <vscale x 2 x float> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 2 x float> @bitcast_nxv4f16_to_nxv2f32(<vscale x 4 x half> %v) #0 { ; CHECK-LABEL: bitcast_nxv4f16_to_nxv2f32: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 4 x half> %v to <vscale x 2 x float> ret <vscale x 2 x float> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 2 x float> @bitcast_nxv4bf16_to_nxv2f32(<vscale x 4 x bfloat> %v) #0 { ; CHECK-LABEL: bitcast_nxv4bf16_to_nxv2f32: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 2 x float> ret <vscale x 2 x float> %bc @@ -834,11 +882,17 @@ ret <vscale x 4 x bfloat> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 4 x bfloat> @bitcast_nxv2i32_to_nxv4bf16(<vscale x 2 x i32> %v) #0 { ; CHECK-LABEL: bitcast_nxv2i32_to_nxv4bf16: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x bfloat> ret <vscale x 4 x bfloat> %bc @@ -852,11 +906,17 @@ ret <vscale x 4 x bfloat> %bc } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 4 x bfloat> @bitcast_nxv2f32_to_nxv4bf16(<vscale x 2 x float> %v) #0 { ; CHECK-LABEL: bitcast_nxv2f32_to_nxv4bf16: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x bfloat> ret <vscale x 4 x bfloat> %bc @@ -1049,13 +1109,18 @@ ret <vscale x 2 x double> %extended } -; TODO: Invalid code generation because the bitcast must change the in-register -; layout when casting between unpacked scalable vector types. define <vscale x 2 x float> @bitcast_short_half_to_float(<vscale x 4 x half> %v) #0 { ; CHECK-LABEL: bitcast_short_half_to_float: ; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z0.h +; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %add = fadd <vscale x 4 x half> %v, %v %bitcast = bitcast <vscale x 4 x half> %add to <vscale x 2 x float>