The Wayback Machine - https://web.archive.org/web/20220212130805/https://github.com/pytorch/text/commit/0711c8a01de918fcdf5bdc0d6ccf9300a0d9c1b6
Skip to content
Permalink
Browse files
Added double-conversion library to torchtext (#906)
* Adding function for reading vectors file in cpp

* Updated delimiter to use ascii

* Updating bugs in cpp

* Final cpp implementation

* Created multithreaded cpp load vectors

* Fixing build errors

* Fixing style check

* Resolving PR comments

* Running benchmarks

* Updated cpp func to use torch::cat

* Using single vector of floats

* Updating reserve tokens

* Updating reserve tokens

* Intermediate implementation with timing benchmark

* Cleaned up comments and timers

* Resolving PR comments

* Re-adding timings and benchmarking

* Added double conversion library, line seek, and ptr to Tensor impl

* Removed timing code and uncommented benchmark

* Uncommented Glove torch save

* Commented out view caching

* Added double-conversion library

* Removed submodule double-conversion

* Using at::launch for multithreading. Fixed a bug with calling torch.save on the cpp vectors object

* Added double conversion to vectors

* Resolving PR comments

* Added a check for processed char count

* fixing lint

* lint
  • Loading branch information
Nayef211 committed Aug 6, 2020
1 parent 5ca2916 commit 0711c8a01de918fcdf5bdc0d6ccf9300a0d9c1b6
Showing with 23 additions and 3 deletions.
  1. +4 −0 .gitmodules
  2. +2 −1 build_tools/setup_helpers/extension.py
  3. +2 −1 third_party/CMakeLists.txt
  4. +1 −0 third_party/double-conversion
  5. +14 −1 torchtext/csrc/vectors.cpp
@@ -6,4 +6,8 @@
path = third_party/re2
url = https://github.com/google/re2
ignore = dirty
[submodule "third_party/double-conversion"]
path = third_party/double-conversion
url = https://github.com/google/double-conversion
ignore = dirty

@@ -79,7 +79,8 @@ def _get_libraries():
return [
'sentencepiece_train',
'sentencepiece',
're2'
're2',
'double-conversion'
]


@@ -8,4 +8,5 @@ endif()
project(thirdparty CXX)
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")

add_subdirectory(re2)
add_subdirectory(re2)
add_subdirectory(double-conversion)
Submodule double-conversion added at b1d531
@@ -1,6 +1,9 @@
#include <ATen/Parallel.h>
#include <atomic>
#include <condition_variable>
#include <double-conversion/double-conversion.h>
#include <double-conversion/ieee.h>
#include <double-conversion/utils.h>
#include <future>
#include <iostream>
#include <mutex>
@@ -172,17 +175,27 @@ void parse_chunk(const std::string &file_path, size_t offset,
fin.open(file_path, std::ios::in);
fin.seekg(offset);

int converter_flags = double_conversion::StringToDoubleConverter::NO_FLAGS;
double_conversion::StringToDoubleConverter converter(
converter_flags, 0.0f, double_conversion::Single::NaN(), NULL, NULL);

for (int64_t i = start_line; i < end_line; i++) {
std::string token;
// read the token
std::getline(fin, token, delimiter);
tokens->push_back(token);

std::string vec_val;
int processed_characters_count;
// read the vector
for (int64_t j = 0; j < vector_dim; j++) {
fin >> vec_val;
data_ptr[i * vector_dim + j] = std::stof(vec_val);
const char *tmp_str = vec_val.c_str();
data_ptr[i * vector_dim + j] = converter.StringToFloat(
tmp_str, strlen(tmp_str), &processed_characters_count);
TORCH_CHECK(processed_characters_count == strlen(tmp_str),
"Processed characters count didn't match vector string "
"length during string to float conversion!");
}
fin >> std::ws;
}

0 comments on commit 0711c8a

Please sign in to comment.