# Patch overview (explanatory preamble; this text precedes the first
# `diff --git` header and is not part of any hunk).
#
# 1. llvm/lib/Target/ARM/ARMISelLowering.cpp
#    Adds three case labels -- Intrinsic::arm_neon_vld1x2, arm_neon_vld1x3 and
#    arm_neon_vld1x4 -- in front of the existing arm_neon_vld2dup / vld3dup /
#    vld4dup labels. All six labels share one body, which executes `continue`;
#    per the in-code comment, base-update combining is skipped for these
#    intrinsics. The TODO text is widened to mention VLD1x as well as VLDxDUP.
#    The enclosing function is outside this hunk and is not shown here.
#
# 2. llvm/test/CodeGen/ARM/pr45824.ll (new file, 100 lines)
#    Run line: llc -mtriple=armv7-none-linux-eabi piped into FileCheck.
#    Defines @vld1x2, @vld1x3 and @vld1x4. In each function the %.loopexit
#    block calls the matching llvm.arm.neon.vld1xN.v8i8.p0i8 intrinsic twice:
#    first on %4, then on %7, where %7 is a getelementptr from %4 indexed by %2
#    (element 5 extracted from the [8 x i32] argument). A vst3 intrinsic call
#    follows. The CHECK lines for every function list only mov, cmp, bne,
#    vst3.8 and b.
#    NOTE(review): presumably this is the regression test for bug report
#    PR45824, judging by the file name only -- confirm against the tracker.
#
# NOTE(review): the leading whitespace of context and added lines below was
# restored from a whitespace-collapsed copy of this patch. Confirm it against
# the tree, or apply with whitespace-insensitive matching.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13866,10 +13866,13 @@
         NumVecs = 3; break;
       case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
         NumVecs = 4; break;
+      case Intrinsic::arm_neon_vld1x2:
+      case Intrinsic::arm_neon_vld1x3:
+      case Intrinsic::arm_neon_vld1x4:
       case Intrinsic::arm_neon_vld2dup:
       case Intrinsic::arm_neon_vld3dup:
       case Intrinsic::arm_neon_vld4dup:
-        // TODO: Support updating VLDxDUP nodes. For now, we just skip
+        // TODO: Support updating VLD1x and VLDxDUP nodes. For now, we just skip
         // combining base updates for such intrinsics.
         continue;
       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
diff --git a/llvm/test/CodeGen/ARM/pr45824.ll b/llvm/test/CodeGen/ARM/pr45824.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/pr45824.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=armv7-none-linux-eabi < %s | FileCheck %s
+
+define void @vld1x2([8 x i32] %0) {
+; CHECK-LABEL: vld1x2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: .LBB0_1: @ %.preheader
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: bne .LBB0_1
+; CHECK-NEXT: @ %bb.2: @ %.loopexit
+; CHECK-NEXT: @ in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: vst3.8 {d16, d17, d18}, [r0]
+; CHECK-NEXT: b .LBB0_1
+  %2 = extractvalue [8 x i32] %0, 5
+  br label %3
+
+3: ; preds = %.loopexit, %1
+  %4 = getelementptr inbounds i8, i8* undef, i32 undef
+  br label %.preheader
+
+.preheader: ; preds = %.preheader, %3
+  %5 = icmp eq i8* %4, undef
+  br i1 %5, label %.loopexit, label %.preheader
+
+.loopexit: ; preds = %.preheader
+  %6 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x2.v8i8.p0i8(i8* %4)
+  %7 = getelementptr inbounds i8, i8* %4, i32 %2
+  %8 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x2.v8i8.p0i8(i8* %7)
+  tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 1)
+  br label %3
+}
+
+define void @vld1x3([8 x i32] %0) {
+; CHECK-LABEL: vld1x3:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: .LBB1_1: @ %.preheader
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: bne .LBB1_1
+; CHECK-NEXT: @ %bb.2: @ %.loopexit
+; CHECK-NEXT: @ in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: vst3.8 {d16, d17, d18}, [r0]
+; CHECK-NEXT: b .LBB1_1
+  %2 = extractvalue [8 x i32] %0, 5
+  br label %3
+
+3: ; preds = %.loopexit, %1
+  %4 = getelementptr inbounds i8, i8* undef, i32 undef
+  br label %.preheader
+
+.preheader: ; preds = %.preheader, %3
+  %5 = icmp eq i8* %4, undef
+  br i1 %5, label %.loopexit, label %.preheader
+
+.loopexit: ; preds = %.preheader
+  %6 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x3.v8i8.p0i8(i8* %4)
+  %7 = getelementptr inbounds i8, i8* %4, i32 %2
+  %8 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x3.v8i8.p0i8(i8* %7)
+  tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 1)
+  br label %3
+}
+
+define void @vld1x4([8 x i32] %0) {
+; CHECK-LABEL: vld1x4:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: .LBB2_1: @ %.preheader
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: bne .LBB2_1
+; CHECK-NEXT: @ %bb.2: @ %.loopexit
+; CHECK-NEXT: @ in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT: vst3.8 {d16, d17, d18}, [r0]
+; CHECK-NEXT: b .LBB2_1
+  %2 = extractvalue [8 x i32] %0, 5
+  br label %3
+
+3: ; preds = %.loopexit, %1
+  %4 = getelementptr inbounds i8, i8* undef, i32 undef
+  br label %.preheader
+
+.preheader: ; preds = %.preheader, %3
+  %5 = icmp eq i8* %4, undef
+  br i1 %5, label %.loopexit, label %.preheader
+
+.loopexit: ; preds = %.preheader
+  %6 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x4.v8i8.p0i8(i8* %4)
+  %7 = getelementptr inbounds i8, i8* %4, i32 %2
+  %8 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x4.v8i8.p0i8(i8* %7)
+  tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 1)
+  br label %3
+}
+
+declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x2.v8i8.p0i8(i8*)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x3.v8i8.p0i8(i8*)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld1x4.v8i8.p0i8(i8*)