Index: llvm/test/Transforms/VectorCombine/AArch64/load-insert-store.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/VectorCombine/AArch64/load-insert-store.ll @@ -0,0 +1,226 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -passes=vector-combine -S %s | FileCheck %s + +target triple = "aarch64" + +define void @load_insert_store_a(ptr %A, i32 %B) { +; CHECK-LABEL: define void @load_insert_store_a +; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <4 x i32>, ptr [[A]], i32 0, i32 1 +; CHECK-NEXT: store i32 [[B]], ptr [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %t0 = load <4 x i32>, ptr %A + %t1 = insertelement <4 x i32> %t0, i32 %B, i32 1 + store <4 x i32> %t1, ptr %A + ret void +} + +define <8 x float> @load_insert_store_b(ptr %A, float %B) { +; CHECK-LABEL: define <8 x float> @load_insert_store_b +; CHECK-SAME: (ptr [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <8 x float>, ptr [[A]], align 32 +; CHECK-NEXT: [[T1:%.*]] = insertelement <8 x float> [[T0]], float [[B]], i32 2 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i32 0, i32 2 +; CHECK-NEXT: store float [[B]], ptr [[TMP0]], align 8 +; CHECK-NEXT: ret <8 x float> [[T1]] +; +entry: + %t0 = load <8 x float>, ptr %A + %t1 = insertelement <8 x float> %t0, float %B, i32 2 + store <8 x float> %t1, ptr %A + ret <8 x float> %t1 +} + +;; Whether this test can be optimized depends on the TTI. +define void @load_insert_store_c(ptr %A, i16 %B, i16 %C) { +; CHECK-LABEL: define void @load_insert_store_c +; CHECK-SAME: (ptr [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <16 x i16>, ptr [[A]], align 32 +; CHECK-NEXT: [[T1:%.*]] = insertelement <16 x i16> [[T0]], i16 [[B]], i32 4 +; CHECK-NEXT: [[T2:%.*]] = insertelement <16 x i16> [[T1]], i16 [[C]], i32 6 +; CHECK-NEXT: store <16 x i16> [[T2]], ptr [[A]], align 32 +; CHECK-NEXT: ret void +; +entry: + %t0 = load <16 x i16>, ptr %A + %t1 = insertelement <16 x i16> %t0, i16 %B, i32 4 + %t2 = insertelement <16 x i16> %t1, i16 %C, i32 6 + store <16 x i16> %t2, ptr %A + ret void +} + +;; Whether this test can be optimized depends on the TTI. +define <16 x i16> @load_insert_store_d(ptr %A, i16 %B, i16 %C) { +; CHECK-LABEL: define <16 x i16> @load_insert_store_d +; CHECK-SAME: (ptr [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <16 x i16>, ptr [[A]], align 32 +; CHECK-NEXT: [[T1:%.*]] = insertelement <16 x i16> [[T0]], i16 [[B]], i32 4 +; CHECK-NEXT: [[T2:%.*]] = insertelement <16 x i16> [[T1]], i16 [[C]], i32 6 +; CHECK-NEXT: store <16 x i16> [[T2]], ptr [[A]], align 32 +; CHECK-NEXT: ret <16 x i16> [[T2]] +; +entry: + %t0 = load <16 x i16>, ptr %A + %t1 = insertelement <16 x i16> %t0, i16 %B, i32 4 + %t2 = insertelement <16 x i16> %t1, i16 %C, i32 6 + store <16 x i16> %t2, ptr %A + ret <16 x i16> %t2 +} + +;; Whether this test can be optimized depends on the TTI. 
+define void @load_insert_store_e(ptr %A, i16 %B, i16 %C, i16 %D) {
+; CHECK-LABEL: define void @load_insert_store_e
+; CHECK-SAME: (ptr [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]], i16 [[D:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <16 x i16>, ptr [[A]], align 32
+; CHECK-NEXT: [[T1:%.*]] = insertelement <16 x i16> [[T0]], i16 [[B]], i32 4
+; CHECK-NEXT: [[T2:%.*]] = insertelement <16 x i16> [[T1]], i16 [[C]], i32 6
+; CHECK-NEXT: [[T3:%.*]] = insertelement <16 x i16> [[T2]], i16 [[D]], i32 8
+; CHECK-NEXT: store <16 x i16> [[T3]], ptr [[A]], align 32
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <16 x i16>, ptr %A
+ %t1 = insertelement <16 x i16> %t0, i16 %B, i32 4
+ %t2 = insertelement <16 x i16> %t1, i16 %C, i32 6
+ %t3 = insertelement <16 x i16> %t2, i16 %D, i32 8
+ store <16 x i16> %t3, ptr %A
+ ret void
+}
+
+;; Whether this test can be optimized depends on the TTI.
+define <8 x float> @load_insert_store_f(ptr %A, float %B, float %C, float %D) {
+; CHECK-LABEL: define <8 x float> @load_insert_store_f
+; CHECK-SAME: (ptr [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <8 x float>, ptr [[A]], align 32
+; CHECK-NEXT: [[T1:%.*]] = insertelement <8 x float> [[T0]], float [[B]], i32 4
+; CHECK-NEXT: [[T2:%.*]] = insertelement <8 x float> [[T1]], float [[C]], i32 6
+; CHECK-NEXT: [[T3:%.*]] = insertelement <8 x float> [[T2]], float [[D]], i32 8
+; CHECK-NEXT: store <8 x float> [[T3]], ptr [[A]], align 32
+; CHECK-NEXT: ret <8 x float> [[T3]]
+;
+entry:
+ %t0 = load <8 x float>, ptr %A
+ %t1 = insertelement <8 x float> %t0, float %B, i32 4
+ %t2 = insertelement <8 x float> %t1, float %C, i32 6
+ %t3 = insertelement <8 x float> %t2, float %D, i32 8
+ store <8 x float> %t3, ptr %A
+ ret <8 x float> %t3
+}
+
+;; This is not optimized due to the volatile load.
+define void @load_insert_store_g(ptr %A, i32 %B) {
+; CHECK-LABEL: define void @load_insert_store_g
+; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load volatile <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = insertelement <4 x i32> [[T0]], i32 [[B]], i32 1
+; CHECK-NEXT: store <4 x i32> [[T1]], ptr [[A]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load volatile <4 x i32>, ptr %A
+ %t1 = insertelement <4 x i32> %t0, i32 %B, i32 1
+ store <4 x i32> %t1, ptr %A
+ ret void
+}
+
+;; This is not optimized due to the volatile store.
+define void @load_insert_store_h(ptr %A, i32 %B) {
+; CHECK-LABEL: define void @load_insert_store_h
+; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = insertelement <4 x i32> [[T0]], i32 [[B]], i32 1
+; CHECK-NEXT: store volatile <4 x i32> [[T1]], ptr [[A]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <4 x i32>, ptr %A
+ %t1 = insertelement <4 x i32> %t0, i32 %B, i32 1
+ store volatile <4 x i32> %t1, ptr %A
+ ret void
+}
+
+;; This is not optimized because the type is not a fixed vector type.
+define void @load_insert_store_i(ptr %A, i32 %B) {
+; CHECK-LABEL: define void @load_insert_store_i
+; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <vscale x 4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = insertelement <vscale x 4 x i32> [[T0]], i32 [[B]], i32 1
+; CHECK-NEXT: store <vscale x 4 x i32> [[T1]], ptr [[A]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <vscale x 4 x i32>, ptr %A
+ %t1 = insertelement <vscale x 4 x i32> %t0, i32 %B, i32 1
+ store <vscale x 4 x i32> %t1, ptr %A
+ ret void
+}
+
+;; This is not optimized because the source vector is not loaded from memory.
+define void @load_insert_store_j(ptr %A, ptr %B, i32 %C) {
+; CHECK-LABEL: define void @load_insert_store_j
+; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = load <4 x i32>, ptr [[B]], align 16
+; CHECK-NEXT: [[T2:%.*]] = add <4 x i32> [[T0]], [[T1]]
+; CHECK-NEXT: [[T3:%.*]] = insertelement <4 x i32> [[T2]], i32 [[C]], i32 1
+; CHECK-NEXT: store <4 x i32> [[T3]], ptr [[A]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <4 x i32>, ptr %A
+ %t1 = load <4 x i32>, ptr %B
+ %t2 = add <4 x i32> %t0, %t1
+ %t3 = insertelement <4 x i32> %t2, i32 %C, i32 1
+ store <4 x i32> %t3, ptr %A
+ ret void
+}
+
+;; This is not optimized because the source address and
+;; the destination address do not match.
+define void @load_insert_store_k(ptr %A, ptr %B, i32 %C) {
+; CHECK-LABEL: define void @load_insert_store_k
+; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = insertelement <4 x i32> [[T0]], i32 [[C]], i32 1
+; CHECK-NEXT: store <4 x i32> [[T1]], ptr [[B]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <4 x i32>, ptr %A
+ %t1 = insertelement <4 x i32> %t0, i32 %C, i32 1
+ store <4 x i32> %t1, ptr %B
+ ret void
+}
+
+;; This is not optimized because the memory is modified by another store
+;; between the vector load and the vector store.
+define void @load_insert_store_m(ptr %A, i32 %B) { +; CHECK-LABEL: define void @load_insert_store_m +; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <4 x i32>, ptr [[A]], align 16 +; CHECK-NEXT: [[T1:%.*]] = insertelement <4 x i32> [[T0]], i32 [[B]], i32 1 +; CHECK-NEXT: store i32 255, ptr [[A]], align 4 +; CHECK-NEXT: store <4 x i32> [[T1]], ptr [[A]], align 16 +; CHECK-NEXT: ret void +; +entry: + %t0 = load <4 x i32>, ptr %A + %t1 = insertelement <4 x i32> %t0, i32 %B, i32 1 + store i32 255, ptr %A + store <4 x i32> %t1, ptr %A + ret void +} Index: llvm/test/Transforms/VectorCombine/RISCV/load-insert-store.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/VectorCombine/RISCV/load-insert-store.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -mtriple=riscv64 -mattr=+v -passes=vector-combine -S %s | FileCheck %s + +define void @load_insert_store_a(ptr %A, i32 %B) { +; CHECK-LABEL: define void @load_insert_store_a +; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <4 x i32>, ptr [[A]], i32 0, i32 1 +; CHECK-NEXT: store i32 [[B]], ptr [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %t0 = load <4 x i32>, ptr %A + %t1 = insertelement <4 x i32> %t0, i32 %B, i32 1 + store <4 x i32> %t1, ptr %A + ret void +} + +define <8 x float> @load_insert_store_b(ptr %A, float %B) { +; CHECK-LABEL: define <8 x float> @load_insert_store_b +; CHECK-SAME: (ptr [[A:%.*]], float [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <8 x float>, ptr [[A]], align 32 +; CHECK-NEXT: [[T1:%.*]] = insertelement <8 x float> [[T0]], float [[B]], i32 2 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i32 0, i32 2 +; CHECK-NEXT: store float [[B]], ptr [[TMP0]], align 8 +; CHECK-NEXT: ret <8 x float> [[T1]] +; +entry: + %t0 = load <8 x float>, ptr %A + %t1 = insertelement <8 x float> %t0, float %B, i32 2 + store <8 x float> %t1, ptr %A + ret <8 x float> %t1 +} + +;; Whether this test can be optimized depends on the TTI. +define void @load_insert_store_c(ptr %A, i16 %B, i16 %C) { +; CHECK-LABEL: define void @load_insert_store_c +; CHECK-SAME: (ptr [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <16 x i16>, ptr [[A]], align 32 +; CHECK-NEXT: [[T1:%.*]] = insertelement <16 x i16> [[T0]], i16 [[B]], i32 4 +; CHECK-NEXT: [[T2:%.*]] = insertelement <16 x i16> [[T1]], i16 [[C]], i32 6 +; CHECK-NEXT: store <16 x i16> [[T2]], ptr [[A]], align 32 +; CHECK-NEXT: ret void +; +entry: + %t0 = load <16 x i16>, ptr %A + %t1 = insertelement <16 x i16> %t0, i16 %B, i32 4 + %t2 = insertelement <16 x i16> %t1, i16 %C, i32 6 + store <16 x i16> %t2, ptr %A + ret void +} + +;; Whether this test can be optimized depends on the TTI. 
+define <16 x i16> @load_insert_store_d(ptr %A, i16 %B, i16 %C) { +; CHECK-LABEL: define <16 x i16> @load_insert_store_d +; CHECK-SAME: (ptr [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <16 x i16>, ptr [[A]], align 32 +; CHECK-NEXT: [[T1:%.*]] = insertelement <16 x i16> [[T0]], i16 [[B]], i32 4 +; CHECK-NEXT: [[T2:%.*]] = insertelement <16 x i16> [[T1]], i16 [[C]], i32 6 +; CHECK-NEXT: store <16 x i16> [[T2]], ptr [[A]], align 32 +; CHECK-NEXT: ret <16 x i16> [[T2]] +; +entry: + %t0 = load <16 x i16>, ptr %A + %t1 = insertelement <16 x i16> %t0, i16 %B, i32 4 + %t2 = insertelement <16 x i16> %t1, i16 %C, i32 6 + store <16 x i16> %t2, ptr %A + ret <16 x i16> %t2 +} + +;; Whether this test can be optimized depends on the TTI. +define void @load_insert_store_e(ptr %A, i16 %B, i16 %C, i16 %D) { +; CHECK-LABEL: define void @load_insert_store_e +; CHECK-SAME: (ptr [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]], i16 [[D:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <16 x i16>, ptr [[A]], align 32 +; CHECK-NEXT: [[T1:%.*]] = insertelement <16 x i16> [[T0]], i16 [[B]], i32 4 +; CHECK-NEXT: [[T2:%.*]] = insertelement <16 x i16> [[T1]], i16 [[C]], i32 6 +; CHECK-NEXT: [[T3:%.*]] = insertelement <16 x i16> [[T2]], i16 [[D]], i32 8 +; CHECK-NEXT: store <16 x i16> [[T3]], ptr [[A]], align 32 +; CHECK-NEXT: ret void +; +entry: + %t0 = load <16 x i16>, ptr %A + %t1 = insertelement <16 x i16> %t0, i16 %B, i32 4 + %t2 = insertelement <16 x i16> %t1, i16 %C, i32 6 + %t3 = insertelement <16 x i16> %t2, i16 %D, i32 8 + store <16 x i16> %t3, ptr %A + ret void +} + +;; Whether this test can be optimized depends on the TTI. +define <8 x float> @load_insert_store_f(ptr %A, float %B, float %C, float %D) { +; CHECK-LABEL: define <8 x float> @load_insert_store_f +; CHECK-SAME: (ptr [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load <8 x float>, ptr [[A]], align 32 +; CHECK-NEXT: [[T1:%.*]] = insertelement <8 x float> [[T0]], float [[B]], i32 4 +; CHECK-NEXT: [[T2:%.*]] = insertelement <8 x float> [[T1]], float [[C]], i32 6 +; CHECK-NEXT: [[T3:%.*]] = insertelement <8 x float> [[T2]], float [[D]], i32 8 +; CHECK-NEXT: store <8 x float> [[T3]], ptr [[A]], align 32 +; CHECK-NEXT: ret <8 x float> [[T3]] +; +entry: + %t0 = load <8 x float>, ptr %A + %t1 = insertelement <8 x float> %t0, float %B, i32 4 + %t2 = insertelement <8 x float> %t1, float %C, i32 6 + %t3 = insertelement <8 x float> %t2, float %D, i32 8 + store <8 x float> %t3, ptr %A + ret <8 x float> %t3 +} + +;; This is not optimized due to the volatile load. +define void @load_insert_store_g(ptr %A, i32 %B) { +; CHECK-LABEL: define void @load_insert_store_g +; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load volatile <4 x i32>, ptr [[A]], align 16 +; CHECK-NEXT: [[T1:%.*]] = insertelement <4 x i32> [[T0]], i32 [[B]], i32 1 +; CHECK-NEXT: store <4 x i32> [[T1]], ptr [[A]], align 16 +; CHECK-NEXT: ret void +; +entry: + %t0 = load volatile <4 x i32>, ptr %A + %t1 = insertelement <4 x i32> %t0, i32 %B, i32 1 + store <4 x i32> %t1, ptr %A + ret void +} + +;; This is not optimized due to the volatile store. 
+define void @load_insert_store_h(ptr %A, i32 %B) {
+; CHECK-LABEL: define void @load_insert_store_h
+; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = insertelement <4 x i32> [[T0]], i32 [[B]], i32 1
+; CHECK-NEXT: store volatile <4 x i32> [[T1]], ptr [[A]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <4 x i32>, ptr %A
+ %t1 = insertelement <4 x i32> %t0, i32 %B, i32 1
+ store volatile <4 x i32> %t1, ptr %A
+ ret void
+}
+
+;; This is not optimized because the type is not a fixed vector type.
+define void @load_insert_store_i(ptr %A, i32 %B) {
+; CHECK-LABEL: define void @load_insert_store_i
+; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <vscale x 4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = insertelement <vscale x 4 x i32> [[T0]], i32 [[B]], i32 1
+; CHECK-NEXT: store <vscale x 4 x i32> [[T1]], ptr [[A]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <vscale x 4 x i32>, ptr %A
+ %t1 = insertelement <vscale x 4 x i32> %t0, i32 %B, i32 1
+ store <vscale x 4 x i32> %t1, ptr %A
+ ret void
+}
+
+;; This is not optimized because the source vector is not loaded from memory.
+define void @load_insert_store_j(ptr %A, ptr %B, i32 %C) {
+; CHECK-LABEL: define void @load_insert_store_j
+; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = load <4 x i32>, ptr [[B]], align 16
+; CHECK-NEXT: [[T2:%.*]] = add <4 x i32> [[T0]], [[T1]]
+; CHECK-NEXT: [[T3:%.*]] = insertelement <4 x i32> [[T2]], i32 [[C]], i32 1
+; CHECK-NEXT: store <4 x i32> [[T3]], ptr [[A]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <4 x i32>, ptr %A
+ %t1 = load <4 x i32>, ptr %B
+ %t2 = add <4 x i32> %t0, %t1
+ %t3 = insertelement <4 x i32> %t2, i32 %C, i32 1
+ store <4 x i32> %t3, ptr %A
+ ret void
+}
+
+;; This is not optimized because the source address and
+;; the destination address do not match.
+define void @load_insert_store_k(ptr %A, ptr %B, i32 %C) {
+; CHECK-LABEL: define void @load_insert_store_k
+; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = insertelement <4 x i32> [[T0]], i32 [[C]], i32 1
+; CHECK-NEXT: store <4 x i32> [[T1]], ptr [[B]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <4 x i32>, ptr %A
+ %t1 = insertelement <4 x i32> %t0, i32 %C, i32 1
+ store <4 x i32> %t1, ptr %B
+ ret void
+}
+
+;; This is not optimized because the memory is modified by another store
+;; between the vector load and the vector store.
+define void @load_insert_store_m(ptr %A, i32 %B) {
+; CHECK-LABEL: define void @load_insert_store_m
+; CHECK-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T0:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[T1:%.*]] = insertelement <4 x i32> [[T0]], i32 [[B]], i32 1
+; CHECK-NEXT: store i32 255, ptr [[A]], align 4
+; CHECK-NEXT: store <4 x i32> [[T1]], ptr [[A]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %t0 = load <4 x i32>, ptr %A
+ %t1 = insertelement <4 x i32> %t0, i32 %B, i32 1
+ store i32 255, ptr %A
+ store <4 x i32> %t1, ptr %A
+ ret void
+}