diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16387,6 +16387,29 @@
     }
   }
 
+  // Index = shl((step(const) + splat(offset)), splat(shift))
+  if (Index.getOpcode() == ISD::SHL &&
+      Index.getOperand(0).getOpcode() == ISD::ADD &&
+      Index.getOperand(0).getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
+    SDValue Add = Index.getOperand(0);
+    SDValue ShiftOp = Index.getOperand(1);
+    SDValue StepOp = Add.getOperand(0);
+    SDValue OffsetOp = Add.getOperand(1);
+    if (auto *Shift =
+            dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(ShiftOp)))
+      if (auto Offset = DAG.getSplatValue(OffsetOp)) {
+        int64_t Step =
+            cast<ConstantSDNode>(StepOp.getOperand(0))->getSExtValue();
+        // Stride does not scale explicitly by 'Scale', because it happens in
+        // the gather/scatter addressing mode.
+        Stride = Step << Shift->getSExtValue();
+        // BasePtr = BasePtr + ((Offset * Scale) << Shift)
+        Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, N->getScale());
+        Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, SDValue(Shift, 0));
+        BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
+      }
+  }
+
   // Return early because no supported pattern is found.
   if (Stride == 0)
     return false;
diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
--- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
@@ -201,9 +201,92 @@
   ret void
 }
 
+; Ensure the resulting load is "vscale x 4" wide, despite the offset giving the
+; impression the gather must be split due to its offset.
+; gather_i8(base, index(offset, 8 * sizeof(i8)))
+define <vscale x 4 x i8> @gather_8i8_index_offset_8([8 x i8]* %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: gather_8i8_index_offset_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x1, lsl #3
+; CHECK-NEXT:    index z0.s, #0, #8
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x8, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
+  %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  %t2 = add <vscale x 4 x i64> %t1, %step
+  %t3 = getelementptr [8 x i8], [8 x i8]* %base, <vscale x 4 x i64> %t2
+  %t4 = bitcast <vscale x 4 x [8 x i8]*> %t3 to <vscale x 4 x i8*>
+  %load = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %t4, i32 4, <vscale x 4 x i1> %pg, <vscale x 4 x i8> undef)
+  ret <vscale x 4 x i8> %load
+}
+
+; Ensure the resulting load is "vscale x 4" wide, despite the offset giving the
+; impression the gather must be split due to its offset.
+; gather_f32(base, index(offset, 8 * sizeof(float)))
+define <vscale x 4 x float> @gather_f32_index_offset_8([8 x float]* %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: gather_f32_index_offset_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    add x9, x0, x1, lsl #5
+; CHECK-NEXT:    index z0.s, #0, w8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
+  %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  %t2 = add <vscale x 4 x i64> %t1, %step
+  %t3 = getelementptr [8 x float], [8 x float]* %base, <vscale x 4 x i64> %t2
+  %t4 = bitcast <vscale x 4 x [8 x float]*> %t3 to <vscale x 4 x float*>
+  %load = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %t4, i32 4, <vscale x 4 x i1> %pg, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %load
+}
+
+; Ensure the resulting store is "vscale x 4" wide, despite the offset giving the
+; impression the scatter must be split due to its offset.
+; scatter_i8(base, index(offset, 8 * sizeof(i8)))
+define void @scatter_i8_index_offset_8([8 x i8]* %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
+; CHECK-LABEL: scatter_i8_index_offset_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x1, lsl #3
+; CHECK-NEXT:    index z1.s, #0, #8
+; CHECK-NEXT:    st1b { z0.s }, p0, [x8, z1.s, sxtw]
+; CHECK-NEXT:    ret
+  %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
+  %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  %t2 = add <vscale x 4 x i64> %t1, %step
+  %t3 = getelementptr [8 x i8], [8 x i8]* %base, <vscale x 4 x i64> %t2
+  %t4 = bitcast <vscale x 4 x [8 x i8]*> %t3 to <vscale x 4 x i8*>
+  call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8*> %t4, i32 2, <vscale x 4 x i1> %pg)
+  ret void
+}
+
+; Ensure the resulting store is "vscale x 4" wide, despite the offset giving the
+; impression the scatter must be split due to its offset.
+; scatter_f16(base, index(offset, 8 * sizeof(half)))
+define void @scatter_f16_index_offset_8([8 x half]* %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
+; CHECK-LABEL: scatter_f16_index_offset_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    add x9, x0, x1, lsl #4
+; CHECK-NEXT:    index z1.s, #0, w8
+; CHECK-NEXT:    st1h { z0.s }, p0, [x9, z1.s, sxtw]
+; CHECK-NEXT:    ret
+  %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
+  %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  %t2 = add <vscale x 4 x i64> %t1, %step
+  %t3 = getelementptr [8 x half], [8 x half]* %base, <vscale x 4 x i64> %t2
+  %t4 = bitcast <vscale x 4 x [8 x half]*> %t3 to <vscale x 4 x half*>
+  call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %t4, i32 2, <vscale x 4 x i1> %pg)
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
 
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
 declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
 declare void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8*>, i32, <vscale x 4 x i1>)
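
The combine in the ISelLowering hunk above relies on a simple algebraic identity: for lane i,

  BasePtr + ((Step*i + Offset) << Shift) * Scale
    == (BasePtr + ((Offset * Scale) << Shift)) + i * ((Step << Shift) * Scale)

so the splat offset folds into the base pointer and the remaining index becomes a plain strided sequence, whose scaling by 'Scale' is then handled by the gather/scatter addressing mode. Below is a minimal standalone C++ sketch of that identity only; the function names and values are illustrative assumptions, not LLVM code.

#include <cassert>
#include <cstdint>

// Per-lane address as written in the IR tests:
//   base + ((i * step + offset) << shift) * scale
int64_t addrOriginal(int64_t base, int64_t i, int64_t step, int64_t offset,
                     int64_t shift, int64_t scale) {
  return base + (((i * step) + offset) << shift) * scale;
}

// Per-lane address after the fold: the splat offset moves into the base
// pointer and the index becomes a strided sequence; the final multiplication
// by 'scale' models the scaling done by the gather/scatter addressing mode.
int64_t addrFolded(int64_t base, int64_t i, int64_t step, int64_t offset,
                   int64_t shift, int64_t scale) {
  int64_t stride = step << shift;                       // Stride = Step << Shift
  int64_t newBase = base + ((offset * scale) << shift); // BasePtr += (Offset * Scale) << Shift
  return newBase + (i * stride) * scale;
}

int main() {
  // Hypothetical values, e.g. a gather over [8 x i8] rows: step = 1,
  // shift = 3 (8-byte rows), scale = 1 (i8 elements).
  for (int64_t i = 0; i < 8; ++i)
    assert(addrOriginal(0x1000, i, 1, 5, 3, 1) == addrFolded(0x1000, i, 1, 5, 3, 1));
  return 0;
}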