I need to classify URLs in a DataFrame, tagging each row with a page class by both exact-match and "contains" (substring) conditions:
import re

import pandas as pd


class PageClassifier:
    def __init__(self, contains_pat, match_pat):
        """
        :param contains_pat: A dict mapping a page class to a list of substring patterns
        :type contains_pat: dict
        :param match_pat: A dict mapping a page class to a list of exact URLs
        :type match_pat: dict
        """
        self.contains_pat = contains_pat
        self.match_pat = match_pat

    def worker(self, data_frame):
        """
        Classifies pages by type (URL patterns)
        :param data_frame: DataFrame with a 'url' column to classify
        :return: the DataFrame with its 'page_class' column filled in
        """
        for key, value in self.contains_pat.items():
            # escape each pattern so '.css' matches a literal dot rather than
            # "any character"; na=False skips rows with a missing URL
            reg_exp = '|'.join(re.escape(pat) for pat in value)
            data_frame.loc[data_frame['url'].str.contains(reg_exp, na=False), 'page_class'] = key
        for key, value in self.match_pat.items():
            data_frame.loc[data_frame['url'].isin(value), 'page_class'] = key
        return data_frame
df = pd.read_csv('logs.csv',
                 delimiter='\t', parse_dates=['date'],
                 chunksize=1000000)
contains = {'catalog': ['/category/', '/tags', '/search'],
            'resources': ['.css', '.js', '.woff', '.ttf', '.html', '.php']}
match = {'info_pages': ['/information', '/about-us']}
classify = PageClassifier(contains, match)
new_pd = pd.DataFrame()
for num, chunk in enumerate(df):
    print('Start chunk', num)
    new_pd = pd.concat([new_pd, classify.worker(chunk)])
new_pd.to_csv('classified.csv', sep='\t', index=False)
But it is very slow and takes too much RAM when I work with files over 10 GB. How can I search and modify the data faster? I need the "exact match" and "contains" pattern searches in one function.
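For reference, here is a minimal sketch of the direction I have in mind, assuming the PageClassifier and the contains/match dicts defined above (the streaming-append part is my own idea, not tested on the full file): instead of accumulating every chunk with pd.concat, each classified chunk is appended straight to the output CSV, so peak memory stays around one chunk rather than the whole file.

reader = pd.read_csv('logs.csv', delimiter='\t',
                     parse_dates=['date'], chunksize=1000000)
for num, chunk in enumerate(reader):
    # append each classified chunk to disk; write the header only once
    classify.worker(chunk).to_csv('classified.csv', sep='\t', index=False,
                                  mode='a', header=(num == 0))

That still scans every URL once per pattern group, though, which is why I am asking how to do both kinds of search in a single faster pass.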