I am creating a binary file format to store a series of vectors alongside some metadata. The vectors will be stored at the start of the file which will have a predetermined size, and the metadata alongside the size of each vector will go to the end.
data = {{1, 2, 3}, {5, 6, 7}}
metadata = {'1011', '1110'}
file:
-->
1235670000
0000000000
0000000000
3111031011
<--
So every time I need a new vector I will read the next available size-plus-metadata block, e.g. 31011: the leading 3 tells me to take 3 items from the current read position at the top of the file, so I end up returning ({1, 2, 3}, '1011').
The idea I want to test is whether storing all my data contiguously will take advantage of caching and speed things up a bit when a very large number of vectors is consumed by an algorithm whose working set does not fit in memory.
My implementation is a .h file containing my class:
#pragma once

#include <bitset>
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <string>
#include <utility>
#include <vector>
// Fixed-capacity binary container: vector payloads grow forward from the start
// of the file while per-vector metadata records (the caller's metadata bytes
// followed by a 64-bit payload size) grow backwards from the end. A write is
// rejected when the two regions would collide.
class BinaryFile {
public:
  // TODO: store the number of vectors in the file to resume write/read from a
  //       different instance (currently the caller must transfer the write
  //       position via get/setLastWritenPosition).
  //
  // filename:     path of the backing file; created if it does not exist.
  // maxFileSize:  total byte budget shared by payloads and metadata records.
  // metadataSize: size in bytes of the caller's metadata block, e.g.
  //               sizeof(std::bitset<N>).
  BinaryFile(std::string filename, std::size_t maxFileSize, std::size_t metadataSize)
      // Each record additionally stores the payload size. The original code
      // used sizeof(long), which is 4 bytes on Windows/LLP64 but 8 on LP64 —
      // a fixed-width std::uint64_t keeps the on-disk format portable and
      // matches the value actually being stored.
      // (Note: the earlier "- 1" experiment in a past TODO would have shifted
      // the slot by one *byte*, not a bit, and is not needed.)
      : filename_{std::move(filename)}, maxFileSize_{maxFileSize},
        metadataSize_{metadataSize + sizeof(std::uint64_t)},
        metadataWritePos_{static_cast<std::streamoff>(maxFileSize - metadataSize_)},
        metadataReadPos_{static_cast<std::streamoff>(maxFileSize - metadataSize_)} {
    file_.open(filename_, std::ios::binary | std::ios::in | std::ios::out);
    if (!file_.is_open()) {
      // std::ios::in fails when the file does not exist: create it first,
      // then re-open in binary mode for both reading and writing.
      file_.open(filename_, std::ios::binary | std::ios::out);
      file_.close();
      file_.open(filename_, std::ios::binary | std::ios::in | std::ios::out);
    }
  }

  // Appends `data` to the front region and its metadata record to the back
  // region. Returns false — writing nothing and advancing no cursor — when
  // the file is not open, there is not enough space, or the stream errors.
  // NOTE: writes the raw bytes of the bitset object; this assumes the
  // constructor was given metadataSize == sizeof(std::bitset<N>).
  // TODO: accept a vector of `T` as the data type.
  template <std::size_t N>
  bool write(const std::vector<int> &data, const std::bitset<N> &metadata) {
    if (!file_.is_open()) {
      return false;
    }
    const std::uint64_t dataSize = data.size() * sizeof(int);
    const std::size_t userMetaSize = metadataSize_ - sizeof(std::uint64_t);
    // Free space is the gap between the payload high-water mark and the next
    // metadata slot; both the payload and its record must fit.
    const std::streamoff space = metadataWritePos_ - dataWritePos_;
    if (space < 0 || static_cast<std::size_t>(space) < dataSize + metadataSize_) {
      return false;
    }
    // Payload at the front of the file.
    file_.seekp(dataWritePos_);
    file_.write(reinterpret_cast<const char *>(data.data()),
                static_cast<std::streamsize>(dataSize));
    // Metadata record (caller bytes + fixed-width payload size) at the back.
    file_.seekp(metadataWritePos_);
    file_.write(reinterpret_cast<const char *>(&metadata),
                static_cast<std::streamsize>(userMetaSize));
    file_.write(reinterpret_cast<const char *>(&dataSize), sizeof(dataSize));
    // Only advance the cursors if everything was actually written.
    if (!file_.good()) {
      return false;
    }
    metadataWritePos_ -= static_cast<std::streamoff>(metadataSize_);
    dataWritePos_ += static_cast<std::streamoff>(dataSize);
    return true;
  }

  // Returns the next (vector, metadata) pair in FIFO order, or an empty
  // vector and zero bitset when the file is closed or exhausted.
  template <std::size_t N>
  std::pair<std::vector<int>, std::bitset<N>> readNext() {
    std::pair<std::vector<int>, std::bitset<N>> result;
    if (!file_.is_open()) {
      return result;
    }
    // Metadata slots grow downwards, so unread records exist only while the
    // write cursor is strictly below the read cursor. The original used `<=`,
    // which read one extra, never-written slot (garbage) on an empty or
    // exhausted file.
    if (metadataWritePos_ < metadataReadPos_) {
      // Metadata record first: caller bytes, then the payload size.
      std::uint64_t dataSize = 0;
      file_.seekg(metadataReadPos_);
      file_.read(reinterpret_cast<char *>(&result.second),
                 static_cast<std::streamsize>(metadataSize_ - sizeof(std::uint64_t)));
      file_.read(reinterpret_cast<char *>(&dataSize), sizeof(dataSize));
      // Payload from the front region.
      result.first.resize(static_cast<std::size_t>(dataSize) / sizeof(int));
      file_.seekg(dataReadPos_);
      file_.read(reinterpret_cast<char *>(result.first.data()),
                 static_cast<std::streamsize>(dataSize));
      metadataReadPos_ -= static_cast<std::streamoff>(metadataSize_);
      dataReadPos_ += static_cast<std::streamoff>(dataSize);
    }
    return result;
  }

  // TODO: is there a better way to keep track of these values between instances?
  // Next metadata slot to be written; hand this to a fresh reader instance via
  // setLastWritenPosition() so it knows where the written records stop.
  std::streamoff getLastWritenPosition() const {
    return metadataWritePos_;
  }
  void setLastWritenPosition(std::streamoff pos) {
    metadataWritePos_ = pos;
  }

private:
  std::string filename_;
  std::size_t maxFileSize_;
  std::size_t metadataSize_;        // caller metadata + 64-bit size field
  std::streamoff metadataWritePos_; // next metadata slot to write (grows down)
  std::streamoff metadataReadPos_;  // next metadata slot to read (grows down)
  std::streamoff dataWritePos_{};   // next payload byte to write (grows up)
  std::streamoff dataReadPos_{};    // next payload byte to read (grows up)
  std::fstream file_;
};
and I am using it in this way:
#include <iostream>
#include <vector>
#include <numeric>
#include "serializer.h"
#define HASH_SIZE 4
void test_write_multiple() {
// Create a binary writer with some data.
const std::string filename = "test.bin";
const std::size_t maxFileSize = 1024;
BinaryFile writer(filename, maxFileSize, sizeof(std::bitset<HASH_SIZE>));
const std::vector<std::vector<int>> data = {
{1, 2, 3},
{4, 5, 6},
{7, 8, 9},
{3, 2, 1}
};
const std::vector<std::bitset<HASH_SIZE>> metadata = {
std::bitset<4>("0101"), // equivalent to decimal value 5
std::bitset<4>("1100"), // equivalent to decimal value 12
std::bitset<4>("0000"), // equivalent to decimal value 0
std::bitset<4>("1111") // equivalent to decimal value 15
};
for (std::size_t i = 0; i < data.size(); ++i) {
if (!writer.write(data[i], metadata[i])) {
std::cout << "::Write failed due to max writer size.\n";
return;
}
}
// Create a reader to get the original data back.
BinaryFile reader(filename, maxFileSize, sizeof(std::bitset<HASH_SIZE>));
reader.setLastWritenPosition(writer.getLastWritenPosition());
for (std::size_t i = 0; i < data.size(); ++i) {
const auto [readData, readMetadata] = reader.readNext<HASH_SIZE>();
// Check that the read data matches the original data.
if (readData != data[i]) {
std::cout << "::Read data does not match original data.\n";
return;
}
// Check that the read metadata matches the original metadata.
if (readMetadata != metadata[i]) {
std::cout << "::Read metadata does not match original metadata.\n";
return;
}
}
std::cout << "Multiple write test done.\n";
}
Are there any obvious mistakes, any optimizations I could have performed. Is the design even any good, or should I structure the class and the data it handles differently?