This is more of an exercise for me to get used to Pandas and its dataframes. For those who haven't heard of it:
Pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with structured (tabular, multidimensional, potentially heterogeneous) and time series data both easy and intuitive.
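To make that concrete, here is a toy dataframe (the column names are made up purely for illustration):

import pandas as pd

# A dataframe is a 2-D labelled data structure: named columns plus a row index.
toy = pd.DataFrame({
    'Team': ['Team1', 'Team2'],
    'Score': [21, 14],
})
print(toy)
#     Team  Score
# 0  Team1     21
# 1  Team2     14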
Given a link like http://ABCD.abc/some_date.html, I have to take the necessary information from the table on that page.
Say the information looks like this:
Team | Another Team | Col2 | Current | Col4 | Halftime | Scores
Team1 | TeamX | info | Current1 | Col4 | Halftime1 | Scores1
Team2 | TeamY | info | Current2 | Col4 | Halftime2 | Scores2
Team3 | TeamW | info | Current3 | Col4 | Halftime3 | Scores3
Team4 | TeamZ | info | Current4 | Col4 | Halftime4 | Scores4
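As an aside, pandas can sometimes pull such a table in a single call. This is only a sketch, assuming the page serves a plain <table class="data"> that parses cleanly; the real page may still need BeautifulSoup, as in my code below:

import pandas as pd

# pd.read_html returns a list of dataframes, one per matching <table>;
# attrs narrows the search to the table with class="data".
tables = pd.read_html('http://www.scoresandodds.com/grid_some_date.html',
                      attrs={'class': 'data'})
grid = tables[0]
print(grid.head())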
I also have a dataframe, loaded from a pickle file, which looks like this:
| Team | Opponent | Col2 | Col3 | Col4 | Col5 | Col6 | Date
0 | Team1 | TeamX | info | info | info | info | info | some_date1 <-- each row's date goes into the link above
1 | TeamX | Team1 | info | info | info | info | info | some_date2
2 | Team3 | TeamW | info | info | info | info | info | some_date3
3 | TeamW | Team3 | info | info | info | info | info | some_date4
... and so on
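A quick way to inspect that pickle is pandas' own reader (assuming the file holds a single pickled dataframe, which is what my loader below expects):

import pandas as pd

# pd.read_pickle unpickles one object from the file; for a file written
# with pickle.dump(df, ...) this returns the dataframe directly.
df = pd.read_pickle('CFB_15_living-2.p')
print(df.head())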
The logic:

- Parse each row from the dataframe (access the link using the date from the `Date` column of that row), and check if the team from this row can be found in the HTML table.
- If you find it, take `Current`, `Halftime` and `Scores` from the table and add the info into the newly created dataframe columns.
- Do this for each row from the dataframe.
I'd like some ideas on how I can optimise it, how to make better use of pandas, and whether there's anything wrong with the logic.
You can find the code below:
import pickle

import requests
import pandas as pd
from bs4 import BeautifulSoup


def get_df_from_file(pickle_filename):
    # Load the first pickled object from the file and return it in a list.
    objects = []
    with open(pickle_filename, "rb") as openfile:
        objects.append(pickle.load(openfile))
    return objects


def add_new_df_columns():
    # Attach the three (empty) result columns to the pickled dataframe.
    return get_df_from_file('CFB_15_living-2.p')[0].join(
        pd.DataFrame(columns=['Currents', 'Halftimes', 'Scores']))


def get_html_data_from_url(custom_date):
    # Fetch the grid page for the given date and collect the team rows
    # from the <table class="data"> element into parallel lists.
    url = 'http://www.scoresandodds.com/grid_{}.html'.format(custom_date)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'lxml')
    rows = soup.find("table", {'class': 'data'}).find_all(
        "tr", {'class': ['team odd', 'team even']})

    teams, currents, halftimes, scores = [], [], [], []
    for row in rows:
        cells = row.find_all("td")
        teams.append(cells[0].get_text().encode('utf-8'))
        currents.append(cells[3].get_text().encode('utf-8'))
        halftimes.append(cells[5].get_text().encode('utf-8'))
        scores.append(cells[6].get_text().encode('utf-8'))

    data = {
        'teams': teams,
        'currents': currents,
        'halftimes': halftimes,
        'scores': scores
    }
    return data


def process_data():
    df_objects = add_new_df_columns()  # data from file
    for index, row in df_objects.iterrows():
        html_data = get_html_data_from_url(row['Date'])  # dict from html
        for index_1, item in enumerate(html_data['teams']):
            if row['Team'] in item:
                # print('True: {} -> {}; Index: {}'.format(row['Team'], item, index))
                df_objects.set_value(index, 'Currents', html_data['currents'][index_1])
                df_objects.set_value(index, 'Halftimes', html_data['halftimes'][index_1])
                df_objects.set_value(index, 'Scores', html_data['scores'][index_1])
    print(df_objects)


if __name__ == '__main__':
    process_data()
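One optimisation I already suspect matters: several rows share the same date, so the same page gets fetched and parsed once per row. A small memoising wrapper around my existing function (just a sketch; the name get_html_data_cached is mine) would avoid the repeat requests:

_page_cache = {}

def get_html_data_cached(custom_date):
    # Fetch and parse each date's page only once; rows that share a
    # date reuse the parsed dict instead of hitting the site again.
    if custom_date not in _page_cache:
        _page_cache[custom_date] = get_html_data_from_url(custom_date)
    return _page_cache[custom_date]

process_data would then call get_html_data_cached(row['Date']) instead of get_html_data_from_url(row['Date']).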