Revisions to Multi Threaded High Performance txt file parsing

Tweeted twitter.com/StackCodeReview/status/1223848680056332288

occurred Feb 2, 2020 at 6:00

added 216 characters in body

Source Link

edited Jan 28, 2020 at 14:03

Oliver Schönrock

2.5k
1
10
23

EDIT

A port of Björn's answer to C++ with further improvements at bottom achieving up to 2.4GB/s on the reference machine.

No answers yet, so I am updating code for stylistic changes only -- use of std::async instead of std::thread and -- returning a std::future<map_t>

Source Link

edited Jan 12, 2020 at 21:47

Oliver Schönrock

2.5k
1
10
23

Note: We attempted CPU pinning (see codebefore switching from std::thread to std::async / std::future), but that made little difference.

#include "flat_hash_map/bytell_hash_map.hpp"
#include "os/fs.hpp"
#include <cstdint>
#include <iostream>
#include <string><future>
#include <string_view><iostream>
#include <thread><string>
#include <unordered_map><string_view>
#include <vector>

using uint64 = std::uint64_t;
using map_t  = ska::bytell_hash_map<uint64, uint64>;
// using map_t  = std::unordered_map<uint64, uint64>;  // ~2.5x slower

// Set the given thread's affinity to be exclusively on the given logical CPU number.
// the hope is to keep the per physcial CPU core L1/L2 cache warm
// results are a bit mixed: 5-10% inconsistent gain, no evidence of loss
void pin_thread_to_cpu(std::thread& t, unsigned cpu_num) {
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(cpu_num, &cpuset);
  int rc = pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset);
  if (rc != 0) std::cerr << "Error calling pthread_setaffinity_np: " << rc << "\n";
}

std::pair<const char* const, const char* const> from_sv(std::string_view sv) {
  return std::make_pair(sv.data(), sv.data() + sv.size());
}

std::string_view to_sv(const char* const begin, const char* const end) {
  return std::string_view{begin, static_cast<std::size_t>(end - begin)};
}

voidmap_t parse(std::string_view buf,) map_t&{
  auto map)          = map_t{};
  auto [begin, end] = from_sv(buf);
  const char* curr  = begin;
  uint64      val   = 0;
  while (curr != end) {
    if (*curr == '\n') {
      map[val] += val;
      val = 0;
    } else {
      val = val * 10 + (*curr - '0');
    }
    ++curr; // NOLINT
  }
  return map;
}

std::vector<std::string_view> chunk(std::string_view whole, int n_chunks, char delim = '\n') {
  auto [whole_begin, whole_end] = from_sv(whole);
  auto        chunk_size        = std::ptrdiff_t{(whole_end - whole_begin) / n_chunks};
  auto        chunks            = std::vector<std::string_view>{};
  const char* end               = whole_begin;
  for (int i = 0; end != whole_end && i < n_chunks; ++i) {
    const char* begin = end;
    if (i == n_chunks - 1) {
      end = whole_end; // always ensure last chunk goes to the end
    } else {
      end = std::min(begin + chunk_size, whole_end);   // NOLINT std::min for OOB check
      while (end != whole_end && *end != delim) ++end; // NOLINT ensure we have a whole line
      if (end != whole_end) ++end;                     // NOLINT one past the end
    }
    chunks.push_back(to_sv(begin, end));
  }
  return chunks;
}

uint64 yahtzee_upper(const std::string& filename) {
  auto     mfile     = os::fs::MemoryMappedFile{filename};
  unsigned n_threads = std::thread::hardware_concurrency();

  auto chunks  = chunk(mfile.get_buffer(), n_threads); // NOLINT
  auto threadsfut_maps = std::vector<std::thread>future<map_t>>{};
  auto maps    =for (std::vector<map_t>{n_threads, map_t{}};

 string_view forchunk: chunk(unsigned i = 0; i < n_threads;mfile.get_buffer(), ++in_threads)) { // NOLINT
    threadsfut_maps.push_back(std::threadasync(parse, chunks[i], std::ref(maps[i]))); // NOLINT
   launch::async, pin_thread_to_cpu(threads[i]parse, ichunk));
  }

  for (auto&& t: threads) t.join();

  uint64 max_total = 0;
  auto   final_map = map_t{};
  for (auto&& mfut_map: mapsfut_maps) {
    map_t map = fut_map.get();
    for (auto ppair: mmap) {
      uint64 total = final_map[pfinal_map[pair.first] += ppair.second;
      if (total > max_total) max_total = total;
    }
  }
  std::cout << final_map.size() << "\n";
  return max_total;
}

int main(int argc, char* argv[]) {
  if (argc < 2) return 1;
  std::cout << yahtzee_upper(argv[1]) << '\n'; // NOLINT
  return 0;
}

Note: We attempted CPU pinning (see code), but that made little difference.

#include "flat_hash_map/bytell_hash_map.hpp"
#include "os/fs.hpp"
#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>
#include <thread>
#include <unordered_map>
#include <vector>

using uint64 = std::uint64_t;
using map_t  = ska::bytell_hash_map<uint64, uint64>;
// using map_t  = std::unordered_map<uint64, uint64>;  // ~2.5x slower

// Set the given thread's affinity to be exclusively on the given logical CPU number.
// the hope is to keep the per physcial CPU core L1/L2 cache warm
// results are a bit mixed: 5-10% inconsistent gain, no evidence of loss
void pin_thread_to_cpu(std::thread& t, unsigned cpu_num) {
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(cpu_num, &cpuset);
  int rc = pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset);
  if (rc != 0) std::cerr << "Error calling pthread_setaffinity_np: " << rc << "\n";
}

std::pair<const char* const, const char* const> from_sv(std::string_view sv) {
  return std::make_pair(sv.data(), sv.data() + sv.size());
}

std::string_view to_sv(const char* const begin, const char* const end) {
  return std::string_view{begin, static_cast<std::size_t>(end - begin)};
}

void parse(std::string_view buf, map_t& map) {
  auto [begin, end] = from_sv(buf);
  const char* curr  = begin;
  uint64      val   = 0;
  while (curr != end) {
    if (*curr == '\n') {
      map[val] += val;
      val = 0;
    } else {
      val = val * 10 + (*curr - '0');
    }
    ++curr;
  }
}

std::vector<std::string_view> chunk(std::string_view whole, int n_chunks, char delim = '\n') {
  auto [whole_begin, whole_end] = from_sv(whole);
  auto chunk_size = std::ptrdiff_t{(whole_end - whole_begin) / n_chunks};
  auto chunks = std::vector<std::string_view>{};
  const char* end    = whole_begin;
  for (int i = 0; end != whole_end && i < n_chunks; ++i) {
    const char* begin = end;
    if (i == n_chunks - 1) {
      end = whole_end; // always ensure last chunk goes to the end
    } else {
      end = std::min(begin + chunk_size, whole_end);   // NOLINT std::min for OOB check
      while (end != whole_end && *end != delim) ++end; // NOLINT ensure we have a whole line
      if (end != whole_end) ++end;                     // NOLINT one past the end
    }
    chunks.push_back(to_sv(begin, end));
  }
  return chunks;
}

uint64 yahtzee_upper(const std::string& filename) {
  auto     mfile     = os::fs::MemoryMappedFile{filename};
  unsigned n_threads = std::thread::hardware_concurrency();

  auto chunks  = chunk(mfile.get_buffer(), n_threads); // NOLINT
  auto threads = std::vector<std::thread>{};
  auto maps    = std::vector<map_t>{n_threads, map_t{}};

  for (unsigned i = 0; i < n_threads; ++i) {
    threads.push_back(std::thread(parse, chunks[i], std::ref(maps[i]))); // NOLINT
    pin_thread_to_cpu(threads[i], i);
  }

  for (auto&& t: threads) t.join();

  uint64 max_total = 0;
  auto   final_map = map_t{};
  for (auto&& m: maps) {
    for (auto p: m) {
      uint64 total = final_map[p.first] += p.second;
      if (total > max_total) max_total = total;
    }
  }
  return max_total;
}

int main(int argc, char* argv[]) {
  if (argc < 2) return 1;
  std::cout << yahtzee_upper(argv[1]) << '\n'; // NOLINT
  return 0;
}

Note: We attempted CPU pinning (before switching from std::thread to std::async / std::future), but that made little difference.

#include "flat_hash_map/bytell_hash_map.hpp"
#include "os/fs.hpp"
#include <cstdint>
#include <future>
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

using uint64 = std::uint64_t;
using map_t  = ska::bytell_hash_map<uint64, uint64>;

std::pair<const char* const, const char* const> from_sv(std::string_view sv) {
  return std::make_pair(sv.data(), sv.data() + sv.size());
}

std::string_view to_sv(const char* const begin, const char* const end) {
  return std::string_view{begin, static_cast<std::size_t>(end - begin)};
}

map_t parse(std::string_view buf) {
  auto map          = map_t{};
  auto [begin, end] = from_sv(buf);
  const char* curr  = begin;
  uint64      val   = 0;
  while (curr != end) {
    if (*curr == '\n') {
      map[val] += val;
      val = 0;
    } else {
      val = val * 10 + (*curr - '0');
    }
    ++curr; // NOLINT
  }
  return map;
}

std::vector<std::string_view> chunk(std::string_view whole, int n_chunks, char delim = '\n') {
  auto [whole_begin, whole_end] = from_sv(whole);
  auto        chunk_size        = std::ptrdiff_t{(whole_end - whole_begin) / n_chunks};
  auto        chunks            = std::vector<std::string_view>{};
  const char* end               = whole_begin;
  for (int i = 0; end != whole_end && i < n_chunks; ++i) {
    const char* begin = end;
    if (i == n_chunks - 1) {
      end = whole_end; // always ensure last chunk goes to the end
    } else {
      end = std::min(begin + chunk_size, whole_end);   // NOLINT std::min for OOB check
      while (end != whole_end && *end != delim) ++end; // NOLINT ensure we have a whole line
      if (end != whole_end) ++end;                     // NOLINT one past the end
    }
    chunks.push_back(to_sv(begin, end));
  }
  return chunks;
}

uint64 yahtzee_upper(const std::string& filename) {
  auto     mfile     = os::fs::MemoryMappedFile{filename};
  unsigned n_threads = std::thread::hardware_concurrency();

  auto fut_maps = std::vector<std::future<map_t>>{};
  for (std::string_view chunk: chunk(mfile.get_buffer(), n_threads)) { // NOLINT
    fut_maps.push_back(std::async(std::launch::async, parse, chunk));
  }

  uint64 max_total = 0;
  auto   final_map = map_t{};
  for (auto&& fut_map: fut_maps) {
    map_t map = fut_map.get();
    for (auto pair: map) {
      uint64 total = final_map[pair.first] += pair.second;
      if (total > max_total) max_total = total;
    }
  }
  std::cout << final_map.size() << "\n";
  return max_total;
}

int main(int argc, char* argv[]) {
  if (argc < 2) return 1;
  std::cout << yahtzee_upper(argv[1]) << '\n'; // NOLINT
  return 0;
}

clarification

Source Link

edited Jan 12, 2020 at 20:11

Oliver Schönrock

2.5k
1
10
23

We attempted CPU pinning (see code), but that made little difference.

Note: We attempted CPU pinning (see code), but that made little difference.

Stack Exchange Network

Return to Question