diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7526,7 +7526,7 @@
     if (SrcEltTy == SmallestEltTy)
       continue;
     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
-    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
     Src.WindowBase *= Src.WindowScale;
   }
@@ -7578,7 +7578,7 @@
                                             ShuffleOps[1], Mask, DAG);
   if (!Shuffle)
     return SDValue();
-  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
 }
 
 enum ShuffleOpCodes {
@@ -12952,6 +12952,28 @@
   return SDValue();
 }
 
+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  SDLoc dl(N);
+
+  // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
+  if (ST->isLittle())
+    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+    // If the valuetypes are the same, we can remove the cast entirely.
+    if (Op->getOperand(0).getValueType() == VT)
+      return Op->getOperand(0);
+    return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformVCMPCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
@@ -14787,6 +14809,8 @@
     return PerformARMBUILD_VECTORCombine(N, DCI);
   case ARMISD::PREDICATE_CAST:
     return PerformPREDICATE_CASTCombine(N, DCI);
+  case ARMISD::VECTOR_REG_CAST:
+    return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
   case ARMISD::VCMP:
     return PerformVCMPCombine(N, DCI, Subtarget);
   case ARMISD::SMULWB: {
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -294,6 +294,21 @@
 def ARMLE            : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
 def ARMLoopDec       : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
 
+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3990,21 +3990,6 @@
 // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
 def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
 
-// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
-// vector register as a different vector type, without changing the contents of
-// the register. It differs from 'bitconvert' in that bitconvert reinterprets
-// the _memory_ storage format of the vector, whereas vector_reg_cast
-// reinterprets the _register_ format - and in big-endian, the memory and
-// register formats are different, so they are different operations.
-//
-// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
-// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
-// whereas 'bitconvert' will map it to the high byte in big-endian mode,
-// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
-// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
-// no code at all if the vector is already in a register.
-def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
-
 let Predicates = [HasMVEInt] in {
   foreach VT = [ v4i1, v8i1, v16i1 ] in {
     def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
@@ -4019,7 +4004,7 @@
 
   foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
     foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
-      def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
+      def : Pat<(VT (ARMVectorRegCast (VT2 MQPR:$src))), (VT MQPR:$src)>;
 }
 
 // end of MVE compares
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -7531,6 +7531,16 @@
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
 }
 
+let Predicates = [HasNEON] in {
+  foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+    foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 QPR:$src))), (VT QPR:$src)>;
+
+  foreach VT = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+    foreach VT2 = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
 // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
 let Predicates = [IsBE,HasNEON] in {
   def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
diff --git a/llvm/test/CodeGen/ARM/neon-vmovn.ll b/llvm/test/CodeGen/ARM/neon-vmovn.ll
--- a/llvm/test/CodeGen/ARM/neon-vmovn.ll
+++ b/llvm/test/CodeGen/ARM/neon-vmovn.ll
@@ -732,8 +732,8 @@
 ;
 ; CHECKBE-LABEL: test:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q8, q1
-; CHECKBE-NEXT:    vrev64.8 q9, q0
+; CHECKBE-NEXT:    vrev64.16 q8, q1
+; CHECKBE-NEXT:    vrev64.16 q9, q0
 ; CHECKBE-NEXT:    vtrn.8 q9, q8
 ; CHECKBE-NEXT:    vrev64.8 q0, q9
 ; CHECKBE-NEXT:    bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -10,8 +10,8 @@
 ;
 ; CHECKBE-LABEL: vmovn32_trunc1:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q1
-; CHECKBE-NEXT:    vrev64.16 q1, q0
+; CHECKBE-NEXT:    vrev64.32 q2, q1
+; CHECKBE-NEXT:    vrev64.32 q1, q0
 ; CHECKBE-NEXT:    vmovnt.i32 q1, q2
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -30,8 +30,8 @@
 ;
 ; CHECKBE-LABEL: vmovn32_trunc2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q0
-; CHECKBE-NEXT:    vrev64.16 q3, q1
+; CHECKBE-NEXT:    vrev64.32 q2, q0
+; CHECKBE-NEXT:    vrev64.32 q3, q1
 ; CHECKBE-NEXT:    vmovnt.i32 q3, q2
 ; CHECKBE-NEXT:    vrev64.16 q0, q3
 ; CHECKBE-NEXT:    bx lr
@@ -49,8 +49,8 @@
 ;
 ; CHECKBE-LABEL: vmovn16_trunc1:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q1
-; CHECKBE-NEXT:    vrev64.8 q1, q0
+; CHECKBE-NEXT:    vrev64.16 q2, q1
+; CHECKBE-NEXT:    vrev64.16 q1, q0
 ; CHECKBE-NEXT:    vmovnt.i16 q1, q2
 ; CHECKBE-NEXT:    vrev64.8 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -69,8 +69,8 @@
 ;
 ; CHECKBE-LABEL: vmovn16_trunc2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q0
-; CHECKBE-NEXT:    vrev64.8 q3, q1
+; CHECKBE-NEXT:    vrev64.16 q2, q0
+; CHECKBE-NEXT:    vrev64.16 q3, q1
 ; CHECKBE-NEXT:    vmovnt.i16 q3, q2
 ; CHECKBE-NEXT:    vrev64.8 q0, q3
 ; CHECKBE-NEXT:    bx lr
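
For readers unfamiliar with the distinction the new ARMInstrInfo.td comment draws between VECTOR_REG_CAST and bitconvert, the following standalone C++ sketch (not part of the patch, and not LLVM code; the names v16, regcast and bitcastBE are made up for illustration) simulates the two v8i16 -> v16i8 lane mappings the comment describes on a big-endian target:

// Minimal host-side sketch of the semantics described in the comment above.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<std::uint16_t, 8> v16 = {0x0102, 0x0304, 0x0506, 0x0708,
                                      0x090A, 0x0B0C, 0x0D0E, 0x0F10};

  // VECTOR_REG_CAST: byte lane 2*i gets the LSB of i16 lane i, independent of
  // system endianness - the register lanes are simply renumbered.
  std::array<std::uint8_t, 16> regcast;
  for (int i = 0; i < 8; ++i) {
    regcast[2 * i]     = std::uint8_t(v16[i] & 0xFF); // LSB -> even byte lane
    regcast[2 * i + 1] = std::uint8_t(v16[i] >> 8);   // MSB -> odd byte lane
  }

  // bitconvert on big-endian: equivalent to storing each i16 lane in
  // big-endian byte order (VSTRH.16) and reloading the bytes (VLDRB.8), so
  // byte lane 2*i gets the MSB of i16 lane i instead.
  std::array<std::uint8_t, 16> bitcastBE;
  for (int i = 0; i < 8; ++i) {
    bitcastBE[2 * i]     = std::uint8_t(v16[i] >> 8); // MSB stored first
    bitcastBE[2 * i + 1] = std::uint8_t(v16[i] & 0xFF);
  }

  // The two mappings differ by a byte swap within each 16-bit element, which
  // is exactly what VREV16.8 performs - hence bitconvert costs an instruction
  // on big-endian while VECTOR_REG_CAST is free once the value is in a
  // register.
  std::printf("lane0: regcast=%02x bitcastBE=%02x\n", regcast[0], bitcastBE[0]);
  return 0;
}

The test updates above reflect this: with the shuffle lowering now using VECTOR_REG_CAST, the big-endian check lines switch to the VREV64 variant matching the original element size rather than the casted one.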