@@ -997,6 +997,7 @@ def __init__(self, path_or_buf, convert_dates=True,
997997 self .path_or_buf = BytesIO (contents )
998998
999999 self ._read_header ()
1000+ self ._setup_dtype ()
10001001
10011002 def __enter__ (self ):
10021003 """ enter context manager """
@@ -1299,6 +1300,23 @@ def _read_old_header(self, first_char):
12991300 # necessary data to continue parsing
13001301 self .data_location = self .path_or_buf .tell ()
13011302
def _setup_dtype(self):
    """Return the numpy record dtype for the file's columns, building it once.

    The dtype is cached on ``self._dtype``; later calls return the cached
    object unchanged. Field ``i`` is named ``'s<i>'``. A column whose entry
    in ``self.typlist`` is a key of ``self.NUMPY_TYPE_MAP`` gets the mapped
    numpy type code prefixed with ``self.byteorder``; any other entry is
    rendered as a fixed-width bytes field ``'S<typ>'``.
    """
    if self._dtype is None:
        def field_type(code):
            # Mapped codes carry the file's byte order; everything else is
            # turned into an 'S<code>' string field.
            if code in self.NUMPY_TYPE_MAP:
                return self.byteorder + self.NUMPY_TYPE_MAP[code]
            return 'S' + str(code)

        fields = [('s' + str(pos), field_type(code))
                  for pos, code in enumerate(self.typlist)]
        self._dtype = np.dtype(fields)

    return self._dtype
1319+
def _calcsize(self, fmt):
    """Return the size in bytes described by ``fmt``.

    Parameters
    ----------
    fmt : int or str
        An int is taken to already be a byte count and is returned
        unchanged. Any other value is treated as a ``struct`` format
        string and measured with ``struct.calcsize`` after being prefixed
        with ``self.byteorder``.

    Returns
    -------
    int
        The byte size.
    """
    # Explicit branch rather than ``is_int and fmt or calcsize(...)``: with
    # that idiom a width of 0 is falsy, falls through to the struct branch
    # and fails concatenating str and int.
    if isinstance(fmt, int):
        return fmt
    return struct.calcsize(self.byteorder + fmt)
@@ -1472,24 +1490,12 @@ def read(self, nrows=None, convert_dates=None,
14721490 if nrows is None :
14731491 nrows = self .nobs
14741492
1475- if (self .format_version >= 117 ) and (self ._dtype is None ):
1493+ if (self .format_version >= 117 ) and (not self ._value_labels_read ):
14761494 self ._can_read_value_labels = True
14771495 self ._read_strls ()
14781496
1479- # Setup the dtype.
1480- if self ._dtype is None :
1481- dtype = [] # Convert struct data types to numpy data type
1482- for i , typ in enumerate (self .typlist ):
1483- if typ in self .NUMPY_TYPE_MAP :
1484- dtype .append (('s' + str (i ), self .byteorder +
1485- self .NUMPY_TYPE_MAP [typ ]))
1486- else :
1487- dtype .append (('s' + str (i ), 'S' + str (typ )))
1488- dtype = np .dtype (dtype )
1489- self ._dtype = dtype
1490-
14911497 # Read data
1492- dtype = self ._dtype
1498+ dtype = self ._setup_dtype ()
14931499 max_read_len = (self .nobs - self ._lines_read ) * dtype .itemsize
14941500 read_len = nrows * dtype .itemsize
14951501 read_len = min (read_len , max_read_len )
@@ -1958,7 +1964,6 @@ def _prepare_categoricals(self, data):
19581964 return data
19591965
19601966 get_base_missing_value = StataMissingValue .get_base_missing_value
1961- index = data .index
19621967 data_formatted = []
19631968 for col , col_is_cat in zip (data , is_cat ):
19641969 if col_is_cat :
@@ -1981,8 +1986,7 @@ def _prepare_categoricals(self, data):
19811986
19821987 # Replace missing values with Stata missing value for type
19831988 values [values == - 1 ] = get_base_missing_value (dtype )
1984- data_formatted .append ((col , values , index ))
1985-
1989+ data_formatted .append ((col , values ))
19861990 else :
19871991 data_formatted .append ((col , data [col ]))
19881992 return DataFrame .from_items (data_formatted )
0 commit comments