Ok, so far so good. I found a way to reduce the execution time by keeping track of the dates I have already fetched, so that no page is downloaded twice. The process is:

1. Load the pickled DataFrame and join the empty Currents, Halftimes and Scores columns onto it.
2. For each row, fetch and parse the grid page for that row's date, reusing the already-fetched data when the date has been seen before.
3. Match the row's team against the parsed table rows and fill in the three new columns.
4. Export the finished DataFrame to results.csv.

That being said, here is the snippet:
import pickle

import requests
import pandas as pd
from bs4 import BeautifulSoup

FILE_TO_PROCESS = 'pickle_file.txt'


def get_df_from_file():
    # Load the pickled DataFrame and join the empty result columns in one step.
    with open(FILE_TO_PROCESS, "rb") as openfile:
        return pickle.load(openfile).join(pd.DataFrame(columns=['Currents', 'Halftimes', 'Scores']))


def get_html_data_from_url(custom_date):
    url = 'http://www.scoresandodds.com/grid_{}.html'.format(custom_date)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'lxml')
    rows = soup.find("table", {'class': 'data'}).find_all("tr", {'class': ['team odd', 'team even']})

    teams, currents, halftimes, scores = [], [], [], []
    for row in rows:
        cells = row.find_all("td")
        # get_text() already returns text, so encoding to bytes is unnecessary.
        teams.append(cells[0].get_text())
        currents.append(cells[3].get_text())
        halftimes.append(cells[5].get_text())
        scores.append(cells[6].get_text())

    return {
        'teams': teams,
        'currents': currents,
        'halftimes': halftimes,
        'scores': scores
    }
def process_data():
    df_objects = get_df_from_file()
    html_cache = {}  # parsed pages keyed by date, so every date is fetched exactly once
    for index, row in df_objects.iterrows():
        date = row['Date']
        if date not in html_cache:
            html_cache[date] = get_html_data_from_url(date)
        html_data = html_cache[date]
        for index_1, item in enumerate(html_data['teams']):
            if row['Team'] in item:
                # .at is the non-deprecated replacement for set_value
                df_objects.at[index, 'Currents'] = html_data['currents'][index_1]
                df_objects.at[index, 'Halftimes'] = html_data['halftimes'][index_1]
                df_objects.at[index, 'Scores'] = html_data['scores'][index_1]
    df_objects.to_csv('results.csv', sep='\t')


if __name__ == '__main__':
    process_data()
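For reference, here is a minimal sketch of the input the script expects: a pickled DataFrame with at least Date and Team columns, where Date is the piece that gets substituted into grid_{}.html. The sample values below are made up, so adjust them to real dates and teams:

import pickle

import pandas as pd

# Hypothetical sample input; the script only relies on the 'Date' and 'Team' columns.
sample = pd.DataFrame({
    'Date': ['20161123', '20161123', '20161124'],
    'Team': ['501 BOSTON', '502 BROOKLYN', '503 CHICAGO'],
})

with open('pickle_file.txt', 'wb') as f:
    pickle.dump(sample, f)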
Also, I realized that there is no need to store the DataFrame objects in a list when I can simply return the DataFrame and join the needed extra columns, all in the same function; that is what get_df_from_file above does now.
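In other words, the one-function version replaces a pattern like this (a hypothetical sketch of the old list-based approach, not the exact code I had):

# Old pattern (sketch): collect the DataFrame in a list and leave the join to the caller.
def get_df_objects_old():
    df_objects = []
    with open(FILE_TO_PROCESS, "rb") as openfile:
        df_objects.append(pickle.load(openfile))
    return df_objects  # caller then had to index the list and join the extra columns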
If you have any other suggestions, please go ahead and share them.