Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4971,9 +4971,25 @@
   return GenerateTBL(Op, ShuffleMask, DAG);
 }
 
+//
+// In little-endian mode, CnstBits and UndefBits will consist of repeated
+// copies of the underlying SplatBits and SplatUndef from isConstantSplat().
+// In big-endian mode, the same bits are reversed in chunks of lane width.
+//
+// Example:
+//
+// little-endian:
+//   CnstBits  -> [0x00010000, 0x00010000]
+//   UndefBits -> [0x00010000, 0x00010000]
+// big-endian:
+//   CnstBits  -> [0x0000, 0x0001, 0x0000, 0x0001]
+//   UndefBits -> [0x0000, 0x0001, 0x0000, 0x0001]
+//
 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
-                               APInt &UndefBits) {
+                               APInt &UndefBits, bool isBigEndian) {
   EVT VT = BVN->getValueType(0);
+  CnstBits = APInt(VT.getSizeInBits(), 0);
+  UndefBits = APInt(VT.getSizeInBits(), 0);
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
@@ -4985,6 +5001,23 @@
       UndefBits <<= SplatBitSize;
       CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
       UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
+    }
+
+    // In big-endian mode, the underlying lanes are reversed.
+    if (isBigEndian) {
+      APInt BeCnstBits(VT.getSizeInBits(), 0), BeUndefBits(VT.getSizeInBits(), 0);
+      unsigned Sz = BVN->getValueType(0).getVectorElementType().getSizeInBits();
+      APInt Mask = APInt::getAllOnesValue(Sz);
+      Mask = Mask.zextOrTrunc(VT.getSizeInBits());
+      for (unsigned I = 0; I < BVN->getValueType(0).getVectorNumElements(); ++I) {
+        BeCnstBits <<= Sz;
+        BeUndefBits <<= Sz;
+        BeCnstBits |= CnstBits.lshr(I * Sz) & Mask;
+        BeUndefBits |= UndefBits.lshr(I * Sz) & Mask;
+      }
+
+      CnstBits = BeCnstBits;
+      UndefBits = BeUndefBits;
     }
 
     return true;
@@ -5004,9 +5037,8 @@
   if (!BVN)
     return Op;
 
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+  APInt CnstBits, UndefBits;
+  if (resolveBuildVector(BVN, CnstBits, UndefBits, !Subtarget->isLittleEndian())) {
     // We only have BIC vector immediate instruction, which is and-not.
     CnstBits = ~CnstBits;
 
@@ -5207,9 +5239,8 @@
   if (!BVN)
     return Op;
 
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+  APInt CnstBits, UndefBits;
+  if (resolveBuildVector(BVN, CnstBits, UndefBits, !Subtarget->isLittleEndian())) {
     // We make use of a little bit of goto ickiness in order to avoid having to
     // duplicate the immediate matching logic for the undef toggled case.
     bool SecondTry = false;
@@ -5318,9 +5349,8 @@
   Op = NormalizeBuildVector(Op, DAG);
   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
 
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+  APInt CnstBits, UndefBits;
+  if (resolveBuildVector(BVN, CnstBits, UndefBits, !Subtarget->isLittleEndian())) {
     // We make use of a little bit of goto ickiness in order to avoid having to
     // duplicate the immediate matching logic for the undef toggled case.
     bool SecondTry = false;
@@ -5928,13 +5958,13 @@
 
 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
-                                    SDLoc dl, SelectionDAG &DAG) {
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    bool isBigEndian) {
   EVT SrcVT = LHS.getValueType();
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
+  APInt CnstBits, UndefBits;
+  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits, isBigEndian);
   bool IsZero = IsCnst && (CnstBits == 0);
 
   if (SrcVT.getVectorElementType().isFloatingPoint()) {
@@ -6030,7 +6060,7 @@
     assert(LHS.getValueType() == RHS.getValueType());
     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
     return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
-                                dl, DAG);
+                                dl, DAG, !Subtarget->isLittleEndian());
   }
 
   assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
@@ -6044,13 +6074,15 @@
   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
   SDValue Cmp =
-      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
+      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG,
+                           !Subtarget->isLittleEndian());
   if (!Cmp.getNode())
     return SDValue();
 
   if (CC2 != AArch64CC::AL) {
     SDValue Cmp2 =
-        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
+        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG,
+                             !Subtarget->isLittleEndian());
     if (!Cmp2.getNode())
       return SDValue();
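
Note (not part of the patch): the following standalone C++ sketch illustrates
the lane reversal that the new resolveBuildVector() performs for big-endian
targets. The function name reverseLanes and the use of uint64_t in place of
APInt are illustrative assumptions of mine; it mirrors the
BeCnstBits/BeUndefBits loop above, not the exact LLVM code.

  #include <cassert>
  #include <cstdint>

  // Reverse the order of Sz-bit lanes within a 64-bit value, the way the
  // patched resolveBuildVector() reorders CnstBits/UndefBits on big-endian.
  static uint64_t reverseLanes(uint64_t Bits, unsigned Sz, unsigned NumLanes) {
    assert(Sz < 64 && Sz * NumLanes <= 64 && "sketch covers sub-64-bit lanes");
    uint64_t Mask = (1ULL << Sz) - 1;
    uint64_t Result = 0;
    for (unsigned I = 0; I < NumLanes; ++I) {
      Result <<= Sz;                        // make room for the next lane
      Result |= (Bits >> (I * Sz)) & Mask;  // lane I lands mirrored
    }
    return Result;
  }

  int main() {
    // ModImmType3 from the test below: the little-endian per-word pattern
    // 0x00010000 ("movi ..., lsl #16") becomes 0x00000001 after 16-bit lane
    // reversal, which is why big-endian expects "movi v.2s, #0x1" unshifted.
    assert(reverseLanes(0x0001000000010000ULL, 16, 4) == 0x0000000100000001ULL);
    // The reversal is an involution: applying it twice restores the input.
    assert(reverseLanes(reverseLanes(0xffff0000ffff0000ULL, 16, 4), 16, 4) ==
           0xffff0000ffff0000ULL);
    return 0;
  }
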
Index: test/CodeGen/AArch64/aarch64-big-endian-movi.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-big-endian-movi.ll
@@ -0,0 +1,227 @@
+; RUN: llc -mtriple=aarch64_be--linux-gnu < %s | FileCheck %s
+
+; CHECK-LABEL: f:
+define i16 @f(<4 x i16> %arg) nounwind {
+  ; CHECK: rev64 v[[REG:[0-9]+]].4h, v[[REG]].4h
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG]].h[0]
+  ; CHECK-NEXT: ret
+  %v = extractelement <4 x i16> %arg, i32 0
+  ret i16 %v
+}
+
+; CHECK-LABEL: g:
+define i16 @g(<8 x i16> %arg) nounwind {
+  ; CHECK: rev64 v[[REG:[0-9]+]].8h, v[[REG]].8h
+  ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG]].h[0]
+  ; CHECK-NEXT: ret
+  %v = extractelement <8 x i16> %arg, i32 0
+  ret i16 %v
+}
+
+; CHECK-LABEL: symmetric:
+define i32 @symmetric() {
+  ; #### ModImmType1 ####
+  ; CHECK: movi v[[REG:[0-9]+]].2s, #0x1, lsl #16
+  ; CHECK-NEXT: rev64 v[[REG]].2s, v[[REG]].2s
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 1, i16 0, i16 1, i16 0>)
+  ; CHECK: movi v[[REG:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT: rev64 v[[REG]].4s, v[[REG]].4s
+  ; CHECK-NEXT: ext v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0>)
+
+  ; #### ModImmType2 ####
+  ; CHECK: movi v[[REG:[0-9]+]].2s, #0x1, lsl #24
+  ; CHECK-NEXT: rev64 v[[REG]].2s, v[[REG]].2s
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 256, i16 0, i16 256, i16 0>)
+  ; CHECK: movi v[[REG:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT: rev64 v[[REG]].4s, v[[REG]].4s
+  ; CHECK-NEXT: ext v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0>)
+
+  ; #### ModImmType3 ####
+  ; CHECK: movi v[[REG:[0-9]+]].2s, #0x1
+  ; CHECK-NEXT: rev64 v[[REG]].2s, v[[REG]].2s
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 0, i16 1, i16 0, i16 1>)
+  ; CHECK: movi v[[REG:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT: rev64 v[[REG]].4s, v[[REG]].4s
+  ; CHECK-NEXT: ext v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>)
+
+  ; #### ModImmType4 ####
+  ; CHECK: movi v[[REG:[0-9]+]].2s, #0x1, lsl #8
+  ; CHECK-NEXT: rev64 v[[REG]].2s, v[[REG]].2s
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 0, i16 256, i16 0, i16 256>)
+  ; CHECK: movi v[[REG:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT: rev64 v[[REG]].4s, v[[REG]].4s
+  ; CHECK-NEXT: ext v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256>)
+
+  ; #### ModImmType5 ####
+  ; CHECK: movi v[[REG:[0-9]+]].4h, #0x1
+  ; CHECK-NEXT: rev64 v[[REG]].4h, v[[REG]].4h
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+  ; CHECK: movi v[[REG:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT: rev64 v[[REG]].8h, v[[REG]].8h
+  ; CHECK-NEXT: ext v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+
+  ; #### ModImmType6 ####
+  ; CHECK: movi v[[REG:[0-9]+]].4h, #0x1, lsl #8
+  ; CHECK-NEXT: rev64 v[[REG]].4h, v[[REG]].4h
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 256, i16 256, i16 256, i16 256>)
+  ; CHECK: movi v[[REG:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT: rev64 v[[REG]].8h, v[[REG]].8h
+  ; CHECK-NEXT: ext v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>)
+
+  ; #### ModImmType9 ####
+  ; CHECK: movi v[[REG:[0-9]+]].8b, #0x1
+  ; CHECK-NEXT: rev64 v[[REG]].8b, v[[REG]].8b
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 257, i16 257, i16 257, i16 257>)
+  ; CHECK: movi v[[REG:[0-9]+]].16b, #0x1
+  ; CHECK-NEXT: rev64 v[[REG]].16b, v[[REG]].16b
+  ; CHECK-NEXT: ext v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>)
+
+  ; #### ModImmType10 ####
+  ; CHECK: movi d{{[0-9]+}}, #0xffff0000ffff0000
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 65535, i16 0, i16 65535, i16 0>)
+  ; CHECK: movi v[[REG:[0-9]+]].2d, #0xffff0000ffff0000
+  ; CHECK-NEXT: ext v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0>)
+
+  ret i32 0
+}
+
+; #### ModImmType7 ####
+define i32 @ModImmType7() {
+  ; CHECK: [[LBL1:LCPI...]]:
+  ; CHECK-NEXT: .hword 511 // 0x1ff
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 511 // 0x1ff
+  ; CHECK-NEXT: .hword 0 // 0x0
+
+  ; CHECK: [[LBL2:LCPI...]]:
+  ; CHECK-NEXT: .hword 511 // 0x1ff
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 511 // 0x1ff
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 511 // 0x1ff
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 511 // 0x1ff
+  ; CHECK-NEXT: .hword 0 // 0x0
+
+  ; CHECK-LABEL: ModImmType7
+  ; CHECK: adrp x[[REG:[0-9]+]], .[[LBL1]]
+  ; CHECK-NEXT: ldr d{{[0-9]+}}, [x[[REG]], :lo12:.[[LBL1]]]
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 511, i16 0, i16 511, i16 0>)
+
+  ; CHECK: adrp x[[REG:[0-9]+]], .[[LBL2]]
+  ; CHECK-NEXT: ldr q{{[0-9]+}}, [x[[REG]], :lo12:.[[LBL2]]]
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 511, i16 0, i16 511, i16 0, i16 511, i16 0, i16 511, i16 0>)
+
+  ret i32 0
+}
+
+; #### ModImmType8 ####
+define i32 @ModImmType8() {
+  ; CHECK: [[LBL1:LCPI...]]:
+  ; CHECK-NEXT: .hword 65535 // 0xffff
+  ; CHECK-NEXT: .hword 1 // 0x1
+  ; CHECK-NEXT: .hword 65535 // 0xffff
+  ; CHECK-NEXT: .hword 1 // 0x1
+
+  ; CHECK: [[LBL2:LCPI...]]:
+  ; CHECK-NEXT: .hword 65535 // 0xffff
+  ; CHECK-NEXT: .hword 1 // 0x1
+  ; CHECK-NEXT: .hword 65535 // 0xffff
+  ; CHECK-NEXT: .hword 1 // 0x1
+  ; CHECK-NEXT: .hword 65535 // 0xffff
+  ; CHECK-NEXT: .hword 1 // 0x1
+  ; CHECK-NEXT: .hword 65535 // 0xffff
+  ; CHECK-NEXT: .hword 1 // 0x1
+
+  ; CHECK-LABEL: ModImmType8
+  ; CHECK: adrp x[[REG:[0-9]+]], .[[LBL1]]
+  ; CHECK-NEXT: ldr d{{[0-9]+}}, [x[[REG]], :lo12:.[[LBL1]]]
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 65535, i16 1, i16 65535, i16 1>)
+
+  ; CHECK: adrp x[[REG:[0-9]+]], .[[LBL2]]
+  ; CHECK-NEXT: ldr q{{[0-9]+}}, [x[[REG]], :lo12:.[[LBL2]]]
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1>)
+
+  ret i32 0
+}
+
+; #### ModImmType11 ####
+define i32 @ModImmType11() {
+  ; CHECK: [[LBL1:LCPI...]]:
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 16392 // 0x4008
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 16392 // 0x4008
+
+  ; CHECK: [[LBL2:LCPI...]]:
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 16392 // 0x4008
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 16392 // 0x4008
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 16392 // 0x4008
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 16392 // 0x4008
+
+  ; CHECK-LABEL: ModImmType11
+  ; CHECK: adrp x[[REG:[0-9]+]], .[[LBL1]]
+  ; CHECK-NEXT: ldr d{{[0-9]+}}, [x[[REG]], :lo12:.[[LBL1]]]
+  ; CHECK-NEXT: bl f
+  call i16 @f(<4 x i16> <i16 0, i16 16392, i16 0, i16 16392>)
+
+  ; CHECK: adrp x[[REG:[0-9]+]], .[[LBL2]]
+  ; CHECK-NEXT: ldr q{{[0-9]+}}, [x[[REG]], :lo12:.[[LBL2]]]
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 0, i16 16392, i16 0, i16 16392, i16 0, i16 16392, i16 0, i16 16392>)
+
+  ret i32 0
+}
+
+; #### ModImmType12 ####
+define i32 @ModImmType12() {
+  ; CHECK: [[LBL2:LCPI...]]:
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 16384 // 0x4000
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 0 // 0x0
+  ; CHECK-NEXT: .hword 16384 // 0x4000
+
+  ; CHECK-LABEL: ModImmType12
+  ; CHECK: adrp x[[REG:[0-9]+]], .[[LBL2]]
+  ; CHECK-NEXT: ldr q{{[0-9]+}}, [x[[REG]], :lo12:.[[LBL2]]]
+  ; CHECK-NEXT: bl g
+  call i16 @g(<8 x i16> <i16 0, i16 0, i16 0, i16 16384, i16 0, i16 0, i16 0, i16 16384>)
+
+  ret i32 0
+}
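
Note (not part of the patch): a worked example of why the ModImmType7/8/11/12
cases above fall back to a literal pool while the "symmetric" cases keep using
movi: their bit patterns stop being valid AdvSIMD modified immediates once the
lanes are reversed. A minimal standalone C++ spot check, my own sketch under
that assumption:

  #include <cassert>
  #include <cstdint>

  int main() {
    // ModImmType7: <4 x i16> <i16 511, i16 0, i16 511, i16 0> is the
    // little-endian per-word pattern 0x000001ff ("movi ..., msl #8").
    uint64_t Le = 0x000001ff000001ffULL, Be = 0;
    for (unsigned I = 0; I < 4; ++I)      // reverse the four 16-bit lanes
      Be = (Be << 16) | ((Le >> (I * 16)) & 0xffff);
    // 0x01ff0000 per word has two non-zero bytes, so it fits neither an
    // lsl- nor an msl-shifted immediate; big-endian therefore loads the
    // vector via adrp/ldr from a constant pool, as the test expects.
    assert(Be == 0x01ff000001ff0000ULL);
    return 0;
  }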