Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -5288,6 +5288,7 @@
   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
   EVT VT = OpLHS.getValueType();
+  bool IsBigEndian = DAG.getTargetLoweringInfo().isBigEndian();
 
   switch (OpNum) {
   default: llvm_unreachable("Unknown shuffle opcode!");
@@ -5317,15 +5318,18 @@
   case OP_VUZPL:
   case OP_VUZPR:
     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
-                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
+                       OpLHS, OpRHS).getValue(IsBigEndian ? 1-OpNum+OP_VUZPL :
+                                              OpNum-OP_VUZPL);
   case OP_VZIPL:
   case OP_VZIPR:
     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
-                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
+                       OpLHS, OpRHS).getValue(IsBigEndian ? 1-OpNum+OP_VZIPL :
+                                              OpNum-OP_VZIPL);
   case OP_VTRNL:
   case OP_VTRNR:
     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
-                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
+                       OpLHS, OpRHS).getValue(IsBigEndian ? 1-OpNum+OP_VTRNL :
+                                              OpNum-OP_VTRNL);
   }
 }
 
@@ -5439,25 +5443,26 @@
     // these operations, DAG memoization will ensure that a single node is
     // used for both shuffles.
     unsigned WhichResult;
+    unsigned SwapResult = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
 
     if (isVTRNMask(ShuffleMask, VT, WhichResult))
       return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
-                         V1, V2).getValue(WhichResult);
+                         V1, V2).getValue(WhichResult^SwapResult);
     if (isVUZPMask(ShuffleMask, VT, WhichResult))
       return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
-                         V1, V2).getValue(WhichResult);
+                         V1, V2).getValue(WhichResult^SwapResult);
     if (isVZIPMask(ShuffleMask, VT, WhichResult))
       return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
-                         V1, V2).getValue(WhichResult);
+                         V1, V2).getValue(WhichResult^SwapResult);
     if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
       return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
-                         V1, V1).getValue(WhichResult);
+                         V1, V1).getValue(WhichResult^SwapResult);
    if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
-                         V1, V1).getValue(WhichResult);
+                         V1, V1).getValue(WhichResult^SwapResult);
    if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
-                         V1, V1).getValue(WhichResult);
+                         V1, V1).getValue(WhichResult^SwapResult);
   }
 
   // If the shuffle is not directly supported and it has 4 elements, use
Index: test/CodeGen/ARM/big-endian-neon-narrow.ll
===================================================================
--- test/CodeGen/ARM/big-endian-neon-narrow.ll
+++ test/CodeGen/ARM/big-endian-neon-narrow.ll
@@ -0,0 +1,104 @@
+; RUN: llc < %s -mtriple armeb-eabi -mattr v7,neon -o - | FileCheck %s
+
+define void @vector_narrow_2i64_to_2i32( <2 x i64>* %loadaddr, <2 x i32>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_2i64_to_2i32:
+; CHECK: vmovn.i64 [[REG:d[0-9]+]]
+; CHECK: vrev64.32 [[REG]], [[REG]]
+  %1 = load <2 x i64>* %loadaddr
+  %2 = trunc <2 x i64> %1 to <2 x i32>
+  store <2 x i32> %2, <2 x i32>* %storeaddr
+  ret void
+}
+
+define void @vector_narrow_2i64_to_2i16( <2 x i64>* %loadaddr, <2 x i16>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_2i64_to_2i16:
+; CHECK: vmovn.i64 [[REG:d[0-9]+]]
+; CHECK: vrev32.16 [[REG]], [[REG]]
+; CHECK: vuzp.16 [[REG]], [[REG2:d[0-9]+]]
+; CHECK: vrev32.16 [[REG]], [[REG2]]
+  %1 = load <2 x i64>* %loadaddr
+  %2 = trunc <2 x i64> %1 to <2 x i16>
+  store <2 x i16> %2, <2 x i16>* %storeaddr
+  ret void
+}
+
+define void @vector_narrow_2i64_to_2i8( <2 x i64>* %loadaddr, <2 x i8>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_2i64_to_2i8:
+; CHECK: vmovn.i64 [[REG:d[0-9]+]]
+; CHECK: vmov.32 r0, [[REG:d[0-9]+]][1]
+; CHECK: strb r0, [r1, #1]
+; CHECK: vmov.32 r0, [[REG:d[0-9]+]][0]
+; CHECK: strb r0, [r1]
+  %1 = load <2 x i64>* %loadaddr
+  %2 = trunc <2 x i64> %1 to <2 x i8>
+  store <2 x i8> %2, <2 x i8>* %storeaddr
+  ret void
+}
+
+define void @vector_narrow_4i32_to_4i16( <4 x i32>* %loadaddr, <4 x i16>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_4i32_to_4i16:
+; CHECK: vmovn.i32 [[REG:d[0-9]+]]
+; CHECK: vrev64.16 [[REG]], [[REG]]
+  %1 = load <4 x i32>* %loadaddr
+  %2 = trunc <4 x i32> %1 to <4 x i16>
+  store <4 x i16> %2, <4 x i16>* %storeaddr
+  ret void
+}
+
+define void @vector_narrow_4i32_to_4i8( <4 x i32>* %loadaddr, <4 x i8>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_4i32_to_4i8:
+; CHECK: vmovn.i32 [[REG:d[0-9]+]]
+; CHECK: vrev16.8 [[REG]], [[REG]]
+; CHECK: vuzp.8 [[REG]], [[REG2:d[0-9]+]]
+; CHECK: vrev32.8 [[REG]], [[REG2]]
+  %1 = load <4 x i32>* %loadaddr
+  %2 = trunc <4 x i32> %1 to <4 x i8>
+  store <4 x i8> %2, <4 x i8>* %storeaddr
+  ret void
+}
+
+define void @vector_narrow_8i16_to_8i8( <8 x i16>* %loadaddr, <8 x i8>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_8i16_to_8i8:
+; CHECK: vmovn.i16 [[REG:d[0-9]+]]
+; CHECK: vrev64.8 [[REG]], [[REG]]
+  %1 = load <8 x i16>* %loadaddr
+  %2 = trunc <8 x i16> %1 to <8 x i8>
+  store <8 x i8> %2, <8 x i8>* %storeaddr
+  ret void
+}
+
+define void @vector_narrow_2i32_to_2i16( <2 x i32>* %loadaddr, <2 x i16>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_2i32_to_2i16:
+; CHECK: vuzp.16 [[REG:d[0-9]+]], [[REG2:d[0-9]+]]
+; CHECK: vrev32.16 [[REG]], [[REG2]]
+  %1 = load <2 x i32>* %loadaddr
+  %2 = trunc <2 x i32> %1 to <2 x i16>
+  store <2 x i16> %2, <2 x i16>* %storeaddr
+  ret void
+}
+
+define void @vector_narrow_2i32_to_2i8( <2 x i32>* %loadaddr, <2 x i8>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_2i32_to_2i8:
+; CHECK: vrev64.32 [[REG:d[0-9]+]]
+; CHECK: vmov.32 r0, [[REG:d[0-9]+]][1]
+; CHECK: strb r0, [r1, #1]
+; CHECK: vmov.32 r0, [[REG]][0]
+; CHECK: strb r0, [r1]
+  %1 = load <2 x i32>* %loadaddr
+  %2 = trunc <2 x i32> %1 to <2 x i8>
+  store <2 x i8> %2, <2 x i8>* %storeaddr
+  ret void
+}
+
+define void @vector_narrow_2i16_to_2i8( <2 x i16>* %loadaddr, <2 x i8>* %storeaddr ) {
+; CHECK-LABEL: vector_narrow_2i16_to_2i8:
+; CHECK: vmov.32 r0, [[REG:d[0-9]+]][1]
+; CHECK: strb r0, [r1, #1]
+; CHECK: vmov.32 r0, [[REG]][0]
+; CHECK: strb r0, [r1]
+  %1 = load <2 x i16>* %loadaddr
+  %2 = trunc <2 x i16> %1 to <2 x i8>
+  store <2 x i8> %2, <2 x i8>* %storeaddr
+  ret void
+}
+