Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9707,6 +9707,11 @@ unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); + + // This optimization expects NEON vectors. Bail out on SVE vectors. + if (Extract.getOperand(0).getValueSizeInBits() > 128) + return false; + if (ExtIdxInBits % CastedEltBitWidth != 0) return false; Index: llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll @@ -1,22 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s +; RUN: llc -mattr=+sve -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" define <4 x i32> @test(<16 x i32>* %arg1, <16 x i32>* %arg2) { ; CHECK-LABEL: test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, #8 -; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: add z2.s, p0/m, z2.s, z2.s -; CHECK-NEXT: ext z0.b, z0.b, z1.b, #16 +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #48 ; CHECK-NEXT: add z1.s, p0/m, z1.s, z1.s ; CHECK-NEXT: dup v0.4s, v0.s[2] -; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret entry: %0 = load <16 x i32>, <16 x i32>* %arg1, align 256 @@ -30,17 +26,13 @@ define <2 x i32> @test2(<16 x i32>* %arg1, <16 x i32>* %arg2) { ; CHECK-LABEL: test2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, #8 -; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: add z2.s, p0/m, z2.s, z2.s -; CHECK-NEXT: ext z0.b, z0.b, z1.b, #24 +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #56 ; CHECK-NEXT: add z1.s, p0/m, z1.s, z1.s ; CHECK-NEXT: dup v0.2s, v0.s[0] -; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret entry: %0 = load <16 x i32>, <16 x i32>* %arg1, align 256 @@ -50,3 +42,239 @@ store <16 x i32> %2, <16 x i32>* %arg1, align 256 ret <2 x i32> %shvec } + +define <32 x i64> @test3(<32 x i1>* %arg1, <32 x i1>* %arg2, <32 x i64>* %arg3) { +; CHECK-LABEL: test3: +; CHECK: // %bb.0: // %L.entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #80 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: lsr w10, w9, #31 +; CHECK-NEXT: ubfx w11, w9, #30, #1 +; CHECK-NEXT: ubfx w12, w9, #29, #1 +; CHECK-NEXT: strb w10, [sp, #31] +; CHECK-NEXT: ubfx w10, w9, #28, #1 +; CHECK-NEXT: strb w11, [sp, #30] +; CHECK-NEXT: ubfx w11, w9, #27, #1 +; CHECK-NEXT: strb w12, [sp, #29] +; CHECK-NEXT: ubfx w12, w9, #26, #1 +; CHECK-NEXT: strb w10, [sp, #28] +; CHECK-NEXT: ubfx w10, w9, #25, #1 +; CHECK-NEXT: strb w11, [sp, #27] +; CHECK-NEXT: ubfx w11, w9, #24, #1 +; CHECK-NEXT: strb w12, [sp, #26] +; CHECK-NEXT: ubfx w12, w9, #23, #1 +; CHECK-NEXT: strb w10, [sp, #25] +; CHECK-NEXT: ubfx w10, w9, #22, #1 +; CHECK-NEXT: strb w11, [sp, #24] +; CHECK-NEXT: ubfx w11, w9, #21, #1 +; CHECK-NEXT: strb w12, [sp, #23] +; CHECK-NEXT: ubfx w12, w9, #20, #1 +; CHECK-NEXT: strb w10, [sp, #22] +; CHECK-NEXT: ubfx w10, w9, #19, #1 +; CHECK-NEXT: strb w11, [sp, #21] +; CHECK-NEXT: ubfx w11, w9, #18, #1 +; CHECK-NEXT: strb w12, [sp, #20] +; CHECK-NEXT: ubfx w12, w9, #17, #1 +; CHECK-NEXT: strb w10, [sp, #19] +; CHECK-NEXT: ubfx w10, w9, #16, #1 +; CHECK-NEXT: strb w11, [sp, #18] +; CHECK-NEXT: ubfx w11, w9, #15, #1 +; CHECK-NEXT: strb w12, [sp, #17] +; CHECK-NEXT: ubfx w12, w9, #14, #1 +; CHECK-NEXT: strb w10, [sp, #16] +; CHECK-NEXT: ubfx w10, w9, #13, #1 +; CHECK-NEXT: strb w11, [sp, #15] +; CHECK-NEXT: ubfx w11, w9, #12, #1 +; CHECK-NEXT: strb w12, [sp, #14] +; CHECK-NEXT: ubfx w12, w9, #11, #1 +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: ubfx w10, w9, #10, #1 +; CHECK-NEXT: strb w11, [sp, #12] +; CHECK-NEXT: ubfx w11, w9, #9, #1 +; CHECK-NEXT: strb w12, [sp, #11] +; CHECK-NEXT: ubfx w12, w9, #8, #1 +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: ubfx w10, w9, #7, #1 +; CHECK-NEXT: strb w11, [sp, #9] +; CHECK-NEXT: ubfx w11, w9, #6, #1 +; CHECK-NEXT: strb w12, [sp, #8] +; CHECK-NEXT: ubfx w12, w9, #5, #1 +; CHECK-NEXT: strb w10, [sp, #7] +; CHECK-NEXT: ubfx w10, w9, #4, #1 +; CHECK-NEXT: strb w11, [sp, #6] +; CHECK-NEXT: ubfx w11, w9, #3, #1 +; CHECK-NEXT: strb w12, [sp, #5] +; CHECK-NEXT: ubfx w12, w9, #2, #1 +; CHECK-NEXT: strb w10, [sp, #4] +; CHECK-NEXT: ubfx w10, w9, #1, #1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: strb w11, [sp, #3] +; CHECK-NEXT: strb w12, [sp, #2] +; CHECK-NEXT: strb w10, [sp, #1] +; CHECK-NEXT: strb w9, [sp] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: lsr w10, w9, #31 +; CHECK-NEXT: ubfx w11, w9, #30, #1 +; CHECK-NEXT: ubfx w12, w9, #29, #1 +; CHECK-NEXT: strb w10, [sp, #63] +; CHECK-NEXT: ubfx w10, w9, #28, #1 +; CHECK-NEXT: strb w11, [sp, #62] +; CHECK-NEXT: ubfx w11, w9, #27, #1 +; CHECK-NEXT: strb w12, [sp, #61] +; CHECK-NEXT: ubfx w12, w9, #26, #1 +; CHECK-NEXT: strb w10, [sp, #60] +; CHECK-NEXT: ubfx w10, w9, #25, #1 +; CHECK-NEXT: strb w11, [sp, #59] +; CHECK-NEXT: ubfx w11, w9, #24, #1 +; CHECK-NEXT: strb w12, [sp, #58] +; CHECK-NEXT: ubfx w12, w9, #23, #1 +; CHECK-NEXT: strb w10, [sp, #57] +; CHECK-NEXT: ubfx w10, w9, #22, #1 +; CHECK-NEXT: strb w11, [sp, #56] +; CHECK-NEXT: ubfx w11, w9, #21, #1 +; CHECK-NEXT: strb w12, [sp, #55] +; CHECK-NEXT: ubfx w12, w9, #20, #1 +; CHECK-NEXT: strb w10, [sp, #54] +; CHECK-NEXT: ubfx w10, w9, #19, #1 +; CHECK-NEXT: strb w11, [sp, #53] +; CHECK-NEXT: ubfx w11, w9, 
#18, #1 +; CHECK-NEXT: strb w12, [sp, #52] +; CHECK-NEXT: ubfx w12, w9, #17, #1 +; CHECK-NEXT: strb w10, [sp, #51] +; CHECK-NEXT: ubfx w10, w9, #16, #1 +; CHECK-NEXT: strb w11, [sp, #50] +; CHECK-NEXT: ubfx w11, w9, #15, #1 +; CHECK-NEXT: strb w12, [sp, #49] +; CHECK-NEXT: ubfx w12, w9, #14, #1 +; CHECK-NEXT: strb w10, [sp, #48] +; CHECK-NEXT: ubfx w10, w9, #13, #1 +; CHECK-NEXT: strb w11, [sp, #47] +; CHECK-NEXT: ubfx w11, w9, #12, #1 +; CHECK-NEXT: strb w12, [sp, #46] +; CHECK-NEXT: ubfx w12, w9, #11, #1 +; CHECK-NEXT: strb w10, [sp, #45] +; CHECK-NEXT: ubfx w10, w9, #10, #1 +; CHECK-NEXT: strb w11, [sp, #44] +; CHECK-NEXT: ubfx w11, w9, #9, #1 +; CHECK-NEXT: strb w12, [sp, #43] +; CHECK-NEXT: ubfx w12, w9, #8, #1 +; CHECK-NEXT: strb w10, [sp, #42] +; CHECK-NEXT: ubfx w10, w9, #7, #1 +; CHECK-NEXT: strb w11, [sp, #41] +; CHECK-NEXT: ubfx w11, w9, #6, #1 +; CHECK-NEXT: strb w12, [sp, #40] +; CHECK-NEXT: ubfx w12, w9, #5, #1 +; CHECK-NEXT: strb w10, [sp, #39] +; CHECK-NEXT: ubfx w10, w9, #4, #1 +; CHECK-NEXT: strb w11, [sp, #38] +; CHECK-NEXT: ubfx w11, w9, #3, #1 +; CHECK-NEXT: strb w12, [sp, #37] +; CHECK-NEXT: ubfx w12, w9, #2, #1 +; CHECK-NEXT: strb w10, [sp, #36] +; CHECK-NEXT: ubfx w10, w9, #1, #1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: strb w11, [sp, #35] +; CHECK-NEXT: strb w12, [sp, #34] +; CHECK-NEXT: mov x12, #8 +; CHECK-NEXT: strb w10, [sp, #33] +; CHECK-NEXT: strb w9, [sp, #32] +; CHECK-NEXT: add x9, sp, #32 +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x9] +; CHECK-NEXT: ptrue p0.d, vl8 +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x2, x12, lsl #3] +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: umov w9, v0.b[8] +; CHECK-NEXT: umov w10, v0.b[9] +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: umov w11, v0.b[7] +; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: mov v1.b[1], w10 +; CHECK-NEXT: umov w10, v0.b[10] +; CHECK-NEXT: mov v2.b[1], w9 +; CHECK-NEXT: umov w9, v0.b[2] +; CHECK-NEXT: mov v1.b[2], w10 +; CHECK-NEXT: umov w10, v0.b[11] +; CHECK-NEXT: mov v2.b[2], w9 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: mov v1.b[3], w10 +; CHECK-NEXT: umov w10, v0.b[12] +; CHECK-NEXT: mov v2.b[3], w9 +; CHECK-NEXT: umov w9, v0.b[4] +; CHECK-NEXT: mov v1.b[4], w10 +; CHECK-NEXT: umov w10, v0.b[13] +; CHECK-NEXT: mov v2.b[4], w9 +; CHECK-NEXT: umov w9, v0.b[5] +; CHECK-NEXT: mov v1.b[5], w10 +; CHECK-NEXT: umov w10, v0.b[14] +; CHECK-NEXT: mov v2.b[5], w9 +; CHECK-NEXT: umov w9, v0.b[6] +; CHECK-NEXT: mov v1.b[6], w10 +; CHECK-NEXT: umov w10, v0.b[15] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: mov v2.b[6], w9 +; CHECK-NEXT: mov x9, #16 +; CHECK-NEXT: mov v1.b[7], w10 +; CHECK-NEXT: mov x10, #24 +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x2, x9, lsl #3] +; CHECK-NEXT: mov v2.b[7], w11 +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x2, x10, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2] +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: lsl z1.d, p0/m, z1.d, #63 +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: asr z1.d, p0/m, z1.d, #63 +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: and z1.d, z1.d, #0x1 +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, #63 +; CHECK-NEXT: cmpne p2.d, p1/z, z1.d, #0 +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: asr z1.d, p0/m, z1.d, #63 +; CHECK-NEXT: and z1.d, z1.d, #0x1 +; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 +; CHECK-NEXT: cmpne p3.d, p1/z, z1.d, #0 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: mov z6.d, p3/m, #0 
// =0x0 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, #63 +; CHECK-NEXT: lsl z1.d, p0/m, z1.d, #63 +; CHECK-NEXT: asr z0.d, p0/m, z0.d, #63 +; CHECK-NEXT: asr z1.d, p0/m, z1.d, #63 +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: and z1.d, z1.d, #0x1 +; CHECK-NEXT: cmpne p2.d, p1/z, z1.d, #0 +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: mov z4.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 +; CHECK-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; CHECK-NEXT: st1d { z4.d }, p0, [x8, x10, lsl #3] +; CHECK-NEXT: st1d { z5.d }, p0, [x8, x12, lsl #3] +; CHECK-NEXT: st1d { z6.d }, p0, [x8] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +L.entry: + %0 = load <32 x i1>, <32 x i1>* %arg1 + %1 = load <32 x i1>, <32 x i1>* %arg2 + %2 = and <32 x i1> %0, %1 + %3 = load <32 x i64>, <32 x i64>* %arg3 + %4 = select <32 x i1> %2, <32 x i64> zeroinitializer, <32 x i64> %3 + ret <32 x i64> %4 +}
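
Reviewer note, not part of the patch: the duplane combine guarded by the new check rewrites an extract-subvector-plus-lane-duplicate shuffle into a NEON DUP on a 128-bit register, so it is only sound when the extract's source vector is at most 128 bits wide. With -aarch64-sve-vector-bits-min=256 or larger, wider fixed-length vectors are lowered into SVE registers instead, which is what the early bail-out detects. A minimal sketch of the kind of shuffle that reaches this path (illustrative only; the function name dup_high_lane is hypothetical, and a 256-bit fixed-length SVE configuration is assumed):

define <4 x i32> @dup_high_lane(<8 x i32>* %p) {
  ; Load a 256-bit fixed-length vector. Under -aarch64-sve-vector-bits-min=256
  ; this value lives in an SVE register, not a 128-bit NEON register.
  %v = load <8 x i32>, <8 x i32>* %p
  ; Duplicate lane 6 into all four result lanes. Lane 6 sits outside the low
  ; 128 bits of the source, so the NEON dup-lane combine must not fire here.
  %shvec = shufflevector <8 x i32> %v, <8 x i32> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  ret <4 x i32> %shvec
}

This mirrors test and test2 above: in their updated CHECK lines the wanted quadword is first moved to the bottom of the register with ext (e.g. ext z0.b, z0.b, z1.b, #48) before a NEON dup with an in-range lane index (dup v0.4s, v0.s[2]), rather than attempting a dup whose lane index only makes sense for a vector wider than 128 bits.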