diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9939,30 +9939,34 @@ Swap = true; } + // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill + // out of range values with 0s. We do need to make sure that any out-of-range + // values are really out-of-range for a v16i8 vector. + bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode()); + MVT IndexVT = MVT::v8i8; + unsigned IndexLen = 8; + if (Op.getValueSizeInBits() == 128) { + IndexVT = MVT::v16i8; + IndexLen = 16; + } + SmallVector<SDValue, 8> TBLMask; for (int Val : ShuffleMask) { for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; if (Swap) - Offset = Offset < 16 ? Offset + 16 : Offset - 16; + Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen; + if (IsUndefOrZero && Offset >= IndexLen) + Offset = 255; TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); } } - MVT IndexVT = MVT::v8i8; - unsigned IndexLen = 8; - if (Op.getValueSizeInBits() == 128) { - IndexVT = MVT::v16i8; - IndexLen = 16; - } - SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; - // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill - // out of range values with 0s. 
- if (V2.isUndef() || isZerosVector(V2.getNode())) { + if (IsUndefOrZero) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -909,9 +909,9 @@ ; CHECK-LABEL: .LCPI90_0: ; CHECK-NEXT: .byte 0 -; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 255 ; CHECK-NEXT: .byte 2 -; CHECK-NEXT: .byte 9 +; CHECK-NEXT: .byte 255 ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 5 ; CHECK-NEXT: .byte 6 @@ -930,14 +930,14 @@ } ; CHECK-LABEL: .LCPI91_0: -; CHECK-NEXT: .byte 24 -; CHECK-NEXT: .byte 16 -; CHECK-NEXT: .byte 26 -; CHECK-NEXT: .byte 17 -; CHECK-NEXT: .byte 28 -; CHECK-NEXT: .byte 29 -; CHECK-NEXT: .byte 30 -; CHECK-NEXT: .byte 31 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 6 +; CHECK-NEXT: .byte 7 define <8 x i8> @vselect_equivalent_shuffle_v8i8_zeroswap(<8 x i8> %a) { ; CHECK-LABEL: vselect_equivalent_shuffle_v8i8_zeroswap: ; CHECK: // %bb.0: @@ -984,12 +984,12 @@ ; CHECK-LABEL: .LCPI93_0: ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .byte 1 -; CHECK-NEXT: .byte 16 -; CHECK-NEXT: .byte 17 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 5 -; CHECK-NEXT: .byte 18 -; CHECK-NEXT: .byte 19 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 ; CHECK-NEXT: .byte 8 ; CHECK-NEXT: .byte 9 ; CHECK-NEXT: .byte 10 @@ -1011,12 +1011,12 @@ ; CHECK: .byte 0 ; CHECK: .byte 1 -; CHECK: .byte 16 -; CHECK: .byte 17 +; CHECK: .byte 255 +; CHECK: .byte 255 ; CHECK: .byte 4 ; CHECK: .byte 5 -; CHECK: .byte 18 -; CHECK: .byte 19 +; CHECK: .byte 255 +; CHECK: .byte 255 ; CHECK: .byte 8 ; CHECK: .byte 9 ; CHECK: .byte 10