Index: MicroBenchmarks/LoopVectorization/CMakeLists.txt
===================================================================
--- MicroBenchmarks/LoopVectorization/CMakeLists.txt
+++ MicroBenchmarks/LoopVectorization/CMakeLists.txt
@@ -13,6 +13,7 @@
   main.cpp
   MathFunctions.cpp
   RuntimeChecks.cpp
+  VectorOperations.cpp
 )
 
 target_link_libraries(LoopVectorizationBenchmarks benchmark)
Index: MicroBenchmarks/LoopVectorization/VectorOperations.cpp
===================================================================
--- /dev/null
+++ MicroBenchmarks/LoopVectorization/VectorOperations.cpp
@@ -0,0 +1,75 @@
+#include <iostream>
+#include <memory>
+#include <random>
+
+#include "benchmark/benchmark.h"
+
+#define ITERATIONS 10000 // Length of each benchmark array and element count passed to each kernel call.
+
+static std::mt19937 rng; // Generator used by init_data; default-constructed and not reseeded in this file.
+
+// Fill the first N elements of A with values drawn by rng from [min, max] of Ty, cast back to Ty.
+template <typename Ty>
+static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
+  using Limits = std::numeric_limits<Ty>;
+  std::uniform_int_distribution<uint64_t> Dist(Limits::min(), Limits::max());
+  for (Ty *Elem = A.get(), *End = Elem + N; Elem != End; ++Elem)
+    *Elem = static_cast<Ty>(Dist(rng));
+}
+
+// Add A[i] into B[i], narrowing each sum to uint8_t, in a loop that requests vectorization width 8 and interleave count 4.
+template <typename T> static void truncVecInLoopWithVW8(const T *A, uint8_t *B, int iterations) {
+#pragma clang loop vectorize_width(8) interleave_count(4)
+  for (int i = 0; i < iterations; i++) { // signed index: a zero or negative count runs no iterations
+    B[i] += A[i]; // the sum is converted back to uint8_t when stored
+  }
+}
+
+template <typename Ty> static void __attribute__((always_inline))
+benchForTruncVecInLoopWithVW8(benchmark::State &state) {
+  std::unique_ptr<Ty[]> Src(new Ty[ITERATIONS]);           // source elements of type Ty
+  std::unique_ptr<uint8_t[]> Dst(new uint8_t[ITERATIONS]); // uint8_t accumulators updated in place
+  init_data(Src, ITERATIONS);
+  init_data(Dst, ITERATIONS);
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(Dst);
+    benchmark::ClobberMemory();
+    truncVecInLoopWithVW8(Src.get(), Dst.get(), ITERATIONS); // the kernel call measured on each iteration
+  }
+}
+
+// Add A[i] into B[i], narrowing each sum to uint8_t, in a loop that requests vectorization width 16 and interleave count 4.
+template <typename T> static void truncVecInLoopWithVW16(const T *A, uint8_t *B, int iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (int i = 0; i < iterations; i++) { // signed index: a zero or negative count runs no iterations
+    B[i] += A[i]; // the sum is converted back to uint8_t when stored
+  }
+}
+
+template <typename Ty> static void __attribute__((always_inline))
+benchForTruncVecInLoopWithVW16(benchmark::State &state) {
+  std::unique_ptr<Ty[]> Src(new Ty[ITERATIONS]);           // source elements of type Ty
+  std::unique_ptr<uint8_t[]> Dst(new uint8_t[ITERATIONS]); // uint8_t accumulators updated in place
+  init_data(Src, ITERATIONS);
+  init_data(Dst, ITERATIONS);
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(Dst);
+    benchmark::ClobberMemory();
+    truncVecInLoopWithVW16(Src.get(), Dst.get(), ITERATIONS); // the kernel call measured on each iteration
+  }
+}
+
+// Define and register one VW8 and one VW16 truncating-add benchmark for source element type `ty`.
+#define ADD_BENCHMARK(ty)                                                      \
+  void benchForTruncVecInLoopFor_##ty##_(benchmark::State &state) {            \
+    benchForTruncVecInLoopWithVW8<ty>(state);                                  \
+  }                                                                            \
+  BENCHMARK(benchForTruncVecInLoopFor_##ty##_);                                \
+  void benchForTruncVecInLoopWithVW16For_##ty##_(benchmark::State &state) {    \
+    benchForTruncVecInLoopWithVW16<ty>(state);                                 \
+  }                                                                            \
+  BENCHMARK(benchForTruncVecInLoopWithVW16For_##ty##_);
+
+ADD_BENCHMARK(uint64_t)
+ADD_BENCHMARK(uint32_t)
+ADD_BENCHMARK(uint16_t)