Note: We attempted CPU pinning (see code, before switching from std::thread to std::async / std::future), but that made little difference.
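For reference, the pre-switch launch loop looked roughly like this: a reconstructed sketch, assuming the earlier void parse(std::string_view, map_t&) signature that wrote into preallocated per-thread maps.

// Pre-switch sketch: one std::thread per chunk, each pinned to its own logical CPU.
auto chunks = chunk(mfile.get_buffer(), n_threads);
auto threads = std::vector<std::thread>{};
auto maps = std::vector<map_t>{n_threads, map_t{}};
for (unsigned i = 0; i < n_threads; ++i) {
  threads.push_back(std::thread(parse, chunks[i], std::ref(maps[i])));
  pin_thread_to_cpu(threads[i], i);
}
for (auto&& t: threads) t.join();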
#include "flat_hash_map/bytell_hash_map.hpp"
#include "os/fs.hpp"
#include <algorithm>
#include <cstdint>
#include <future>
#include <iostream>
#include <string>
#include <string_view>
#include <thread>
#include <unordered_map>
#include <vector>
using uint64 = std::uint64_t;
using map_t = ska::bytell_hash_map<uint64, uint64>;
// using map_t = std::unordered_map<uint64, uint64>; // ~2.5x slower
// Set the given thread's affinity so it runs exclusively on the given logical CPU.
// The hope is to keep each physical core's L1/L2 cache warm.
// Results are mixed: an inconsistent 5-10% gain, but no evidence of a loss.
void pin_thread_to_cpu(std::thread& t, unsigned cpu_num) {
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(cpu_num, &cpuset);
  int rc = pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset);
  if (rc != 0) std::cerr << "Error calling pthread_setaffinity_np: " << rc << "\n";
}
// Convert between a string_view and a [begin, end) pointer pair.
std::pair<const char* const, const char* const> from_sv(std::string_view sv) {
  return std::make_pair(sv.data(), sv.data() + sv.size());
}
std::string_view to_sv(const char* const begin, const char* const end) {
  return std::string_view{begin, static_cast<std::size_t>(end - begin)};
}
// Parse newline-terminated, non-negative integers from buf,
// accumulating map[value] += value for each line.
map_t parse(std::string_view buf) {
  auto map = map_t{};
  auto [begin, end] = from_sv(buf);
  const char* curr = begin;
  uint64 val = 0;
  while (curr != end) {
    if (*curr == '\n') {
      map[val] += val;
      val = 0;
    } else {
      val = val * 10 + (*curr - '0');
    }
    ++curr; // NOLINT
  }
  return map;
}
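// Example (assuming one value per line in the input): parse("5\n5\n3\n")
// yields {5: 10, 3: 3}; a trailing value without a final '\n' is dropped.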
std::vector<std::string_view> chunk(std::string_view whole, int n_chunks, char delim = '\n') {
  auto [whole_begin, whole_end] = from_sv(whole);
  auto chunk_size = std::ptrdiff_t{(whole_end - whole_begin) / n_chunks};
  auto chunks = std::vector<std::string_view>{};
  const char* end = whole_begin;
  for (int i = 0; end != whole_end && i < n_chunks; ++i) {
    const char* begin = end;
    if (i == n_chunks - 1) {
      end = whole_end; // always ensure the last chunk goes to the end
    } else {
      end = std::min(begin + chunk_size, whole_end); // NOLINT std::min for OOB check
      while (end != whole_end && *end != delim) ++end; // NOLINT ensure we have a whole line
      if (end != whole_end) ++end; // NOLINT one past the end
    }
    chunks.push_back(to_sv(begin, end));
  }
  return chunks;
}
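// Example: chunk("1\n22\n333\n", 2) -> {"1\n22\n", "333\n"}; each cut point is
// advanced to the next delimiter, so no line is ever split across chunks.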
// Parse all chunks in parallel, then merge the per-chunk maps and
// return the largest accumulated total.
uint64 yahtzee_upper(const std::string& filename) {
  auto mfile = os::fs::MemoryMappedFile{filename};
  unsigned n_threads = std::thread::hardware_concurrency();
  auto fut_maps = std::vector<std::future<map_t>>{};
  for (std::string_view chunk: chunk(mfile.get_buffer(), n_threads)) { // NOLINT
    fut_maps.push_back(std::async(std::launch::async, parse, chunk));
  }
  uint64 max_total = 0;
  auto final_map = map_t{};
  for (auto&& fut_map: fut_maps) {
    map_t map = fut_map.get(); // blocks until this chunk's parse is done
    for (auto pair: map) {
      uint64 total = final_map[pair.first] += pair.second;
      if (total > max_total) max_total = total;
    }
  }
  std::cout << final_map.size() << "\n"; // number of distinct values
  return max_total;
}
int main(int argc, char* argv[]) {
  if (argc < 2) return 1;
  std::cout << yahtzee_upper(argv[1]) << '\n'; // NOLINT
  return 0;
}
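A build sketch, assuming Linux with g++ (pthread_setaffinity_np is glibc-specific) and project-local include paths for flat_hash_map and os/fs.hpp; the source file name here is hypothetical:

g++ -std=c++17 -O3 -pthread -I. yahtzee.cpp -o yahtzee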