diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13894,7 +13894,17 @@ case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: - case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld1x4: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.arg_size() - 1); + Info.offset = 0; + Info.align.reset(); + // volatile loads with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOLoad; + return true; + } case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: @@ -13902,9 +13912,13 @@ case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. 
- uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + // These intrinsics return a struct whose members all share one vector type; memVT is one element of that type per struct member + Type *RetTy = I.getType(); + auto *StructTy = cast<StructType>(RetTy); + unsigned NumElts = StructTy->getNumElements(); + Type *VecTy = StructTy->getElementType(0); + MVT EleVT = MVT::getVT(VecTy).getVectorElementType(); + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts); Info.ptrVal = I.getArgOperand(I.arg_size() - 1); Info.offset = 0; Info.align.reset(); @@ -13917,20 +13931,40 @@ case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: - case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st1x4: { + Info.opc = ISD::INTRINSIC_VOID; + unsigned NumElts = 0; + for (const Value *Arg : I.args()) { + Type *ArgTy = Arg->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.arg_size() - 1); + Info.offset = 0; + Info.align.reset(); + // volatile stores with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOStore; + return true; + } case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: { Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored. 
unsigned NumElts = 0; + // All vector operands have the same type; memVT is one element of that type per vector operand + Type *VecTy = I.getArgOperand(0)->getType(); + MVT EleVT = MVT::getVT(VecTy).getVectorElementType(); + for (const Value *Arg : I.args()) { Type *ArgTy = Arg->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + NumElts += 1; } - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts); Info.ptrVal = I.getArgOperand(I.arg_size() - 1); Info.offset = 0; Info.align.reset(); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -2370,7 +2370,7 @@ } ; CHECK-LABEL: name: test_llvm.aarch64.neon.ld3.v4i32.p0i32 -; CHECK: %1:_(<4 x s32>), %2:_(<4 x s32>), %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld3), %0(p0) :: (load (s384) from %ir.ptr, align 64) +; CHECK: %1:_(<4 x s32>), %2:_(<4 x s32>), %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld3), %0(p0) :: (load (<12 x s32>) from %ir.ptr, align 64) define void @test_llvm.aarch64.neon.ld3.v4i32.p0i32(ptr %ptr) { %arst = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %ptr) ret void diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll b/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon -O2 | FileCheck %s + +; st2 must be emitted before the two ldrb. +; Previously one ldrb could be placed before st2 because of the conservative memVT set for st2lane, +; which led basic-aa to go wrong. 
+ +define dso_local i32 @test_vst2_lane_u8([2 x <8 x i8>] %vectors.coerce) local_unnamed_addr { +; CHECK-LABEL: test_vst2_lane_u8: +; CHECK: st2 { v[[V1:[0-9]+]].b, v[[V2:[0-9]+]].b }[6], [x8] +; CHECK-NEXT: umov w[[W1:[0-9]+]], v[[V12:[0-9]+]].b[6] +; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #12] +; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #13] +entry: + %temp = alloca [2 x i8], align 4 + %vectors.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 0 + %vectors.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 1 + call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %temp) #4 + call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> %vectors.coerce.fca.0.extract, <8 x i8> %vectors.coerce.fca.1.extract, i64 6, ptr nonnull %temp) + %0 = load i8, ptr %temp, align 4 + %vget_lane = extractelement <8 x i8> %vectors.coerce.fca.0.extract, i64 6 + %cmp8.not = icmp ne i8 %0, %vget_lane + %arrayidx3.1 = getelementptr inbounds [2 x i8], ptr %temp, i64 0, i64 1 + %1 = load i8, ptr %arrayidx3.1, align 1 + %vget_lane.1 = extractelement <8 x i8> %vectors.coerce.fca.1.extract, i64 6 + %cmp8.not.1 = icmp ne i8 %1, %vget_lane.1 + %or.cond = select i1 %cmp8.not, i1 true, i1 %cmp8.not.1 + %cmp.lcssa = zext i1 %or.cond to i32 + call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %temp) #4 + ret i32 %cmp.lcssa +} + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2 +declare void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8>, <8 x i8>, i64, ptr nocapture) #2 +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2 diff --git a/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll b/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll @@ -0,0 +1,106 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=instruction-select < %s | FileCheck %s + +%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> } +%struct.__neon_float32x2x3_t = type { <2 x 
float>, <2 x float>, <2 x float> } +%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> } + +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float*) + +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*) + +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*) + +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) + + +define %struct.__neon_float32x2x2_t @test_ld2(float* %addr) { + ; CHECK-LABEL: name: test_ld2 + ; CHECK: LD2Twov2s {{.*}} :: (load (s128) {{.*}}) + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x2_t %val +} + +define %struct.__neon_float32x2x3_t @test_ld3(float* %addr) { + ; CHECK-LABEL: name: test_ld3 + ; CHECK: LD3Threev2s {{.*}} :: (load (s192) {{.*}}) + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x3_t %val +} + +define %struct.__neon_float32x2x4_t @test_ld4(float* %addr) { + ; CHECK-LABEL: name: test_ld4 + ; CHECK: LD4Fourv2s {{.*}} :: (load (s256) {{.*}}) + %val = call 
%struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x4_t %val +} + +define %struct.__neon_float32x2x2_t @test_ld1x2(float* %addr) { + ; CHECK-LABEL: name: test_ld1x2 + ; CHECK: LD1Twov2s {{.*}} :: (load (s128) {{.*}}) + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x2_t %val +} + +define %struct.__neon_float32x2x3_t @test_ld1x3(float* %addr) { + ; CHECK-LABEL: name: test_ld1x3 + ; CHECK: LD1Threev2s {{.*}} :: (load (s192) {{.*}}) + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x3_t %val +} + +define %struct.__neon_float32x2x4_t @test_ld1x4(float* %addr) { + ; CHECK-LABEL: name: test_ld1x4 + ; CHECK: LD1Fourv2s {{.*}} :: (load (s256) {{.*}}) + %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x4_t %val +} + +define %struct.__neon_float32x2x2_t @test_ld2r(float* %addr) { + ; CHECK-LABEL: name: test_ld2r + ; CHECK: LD2Rv2s {{.*}} :: (load (s64) {{.*}}) + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x2_t %val +} + +define %struct.__neon_float32x2x3_t @test_ld3r(float* %addr) { + ; CHECK-LABEL: name: test_ld3r + ; CHECK: LD3Rv2s {{.*}} :: (load (s96) {{.*}}) + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x3_t %val +} + +define %struct.__neon_float32x2x4_t @test_ld4r(float* %addr) { + ; CHECK-LABEL: name: test_ld4r + ; CHECK: LD4Rv2s {{.*}} :: (load (s128) {{.*}}) + %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x4_t %val +} + +define %struct.__neon_float32x2x2_t @test_ld2lane(<2 x float> %a, <2 x float> %b, float* %addr) { + ; CHECK-LABEL: name: test_ld2lane + ; CHECK: 
{{.*}} LD2i32 {{.*}} + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, i64 1, float* %addr) + ret %struct.__neon_float32x2x2_t %val +} + +define %struct.__neon_float32x2x3_t @test_ld3lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, float* %addr) { + ; CHECK-LABEL: name: test_ld3lane + ; CHECK: {{.*}} LD3i32 {{.*}} + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, i64 1, float* %addr) + ret %struct.__neon_float32x2x3_t %val +} + +define %struct.__neon_float32x2x4_t @test_ld4lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, float* %addr) { + ; CHECK-LABEL: name: test_ld4lane + ; CHECK: {{.*}} LD4i32 {{.*}} + %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, i64 1, float* %addr) + ret %struct.__neon_float32x2x4_t %val +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll b/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll --- a/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll +++ b/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll @@ -23,8 +23,6 @@ %cr = fadd <4 x float> %cl, %dl %dr = fadd <4 x float> %dl, %al -; The sizes below are conservative. AArch64TargetLowering -; conservatively assumes the entire vector is stored. tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res) ; CHECK: ST2Twov4s {{.*}} :: (store (s256) {{.*}}) tail call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res) @@ -46,8 +44,6 @@ %cr = fadd <4 x float> %cl, %dl %dr = fadd <4 x float> %dl, %al -; The sizes below are conservative. AArch64TargetLowering -; conservatively assumes the entire vector is stored. 
tail call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res) ; CHECK: ST1Twov4s {{.*}} :: (store (s256) {{.*}}) tail call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res) @@ -69,14 +65,12 @@ %cr = fadd <4 x float> %cl, %dl %dr = fadd <4 x float> %dl, %al -; The sizes below are conservative. AArch64TargetLowering -; conservatively assumes the entire vector is stored. tail call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, i64 1, ptr %res) -; CHECK: ST2i32 {{.*}} :: (store (s256) {{.*}}) +; CHECK: ST2i32 {{.*}} :: (store (s64) {{.*}}) tail call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, ptr %res) -; CHECK: ST3i32 {{.*}} :: (store (s384) {{.*}}) +; CHECK: ST3i32 {{.*}} :: (store (s96) {{.*}}) tail call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, ptr %res) -; CHECK: ST4i32 {{.*}} :: (store (s512) {{.*}}) +; CHECK: ST4i32 {{.*}} :: (store (s128) {{.*}}) ret void }