I currently have the following python code that adds a few calculated columns to my consol file. Essentially it combines all the sales files into one combined DF and then adds 4 new sales columns with some conditions back to the consol file. The code works, but it takes about an hour to complete.
For reference the combined sales data probably adds up to about 2m~ rows, whilst the consol data contains 40k rows to which it is adding the sales when the criteria match. Is there anything that could be done to optimize the code to run a bit faster?
import os
import pandas as pd
# Folder paths
consol_folder = r'C:\Users\user\Desktop\consol_data'
sales_folder = r'C:\Users\user\Desktop\PowerQuery_sales'
output_folder = r'C:\Users\user\Desktop\output'
output_file = os.path.join(output_folder, 'output.xlsx')
# Check if output file already exists and remove it if it does
if os.path.exists(output_file):
os.remove(output_file)
# Importing sales files
sales_files = os.listdir(sales_folder)
sales_data = pd.DataFrame()
for i, file in enumerate(sales_files):
print(f"Processing sales file {i+1} of {len(sales_files)}: {file}")
if file.endswith('.xlsx'):
file_path = os.path.join(sales_folder, file)
df = pd.read_excel(file_path)
sales_data = pd.concat([sales_data, df], ignore_index=True)
print(f"Sales files imported and combined!")
# Importing consol file
consol_file = os.path.join(consol_folder, '2023_07_16 consolfile_updated.xlsx')
consol_data = pd.read_excel(consol_file)
print(f"Consol file imported!")
# Adding new columns to the consol file
consol_data['30D sales'] = 0
consol_data['60D sales'] = 0
consol_data['90D sales'] = 0
consol_data['YTD sales'] = 0
# Calculation & filters to apply
for index, row in consol_data.iterrows():
material_no = row['MaterialNo']
customer_type = row['CustomerType']
sale_group = row['SaleGroup']
on_shelf_date = row['on-shelf date']
c_o = row['C/O']
if customer_type in ['FS', 'MB']:
sales_filtered = sales_data[
(sales_data['MaterialNo'] == material_no) &
(sales_data['CustomerType'] == customer_type) &
(sales_data['SaleGroup'] == sale_group) &
(sales_data['NetSalesDate'] >= on_shelf_date)
]
else:
sales_filtered = sales_data[
(sales_data['MaterialNo'] == material_no) &
(sales_data['CustomerType'] == customer_type) &
(sales_data['NetSalesDate'] >= on_shelf_date)
]
consol_data.at[index, '30D sales'] = sales_filtered[
(sales_filtered['NetSalesDate'] >= on_shelf_date) &
(sales_filtered['NetSalesDate'] <= on_shelf_date + pd.DateOffset(days=30))
]['Invoiced/Sold/Shipped Qty'].sum()
consol_data.at[index, '60D sales'] = sales_filtered[
(sales_filtered['NetSalesDate'] >= on_shelf_date) &
(sales_filtered['NetSalesDate'] <= on_shelf_date + pd.DateOffset(days=60))
]['Invoiced/Sold/Shipped Qty'].sum()
consol_data.at[index, '90D sales'] = sales_filtered[
(sales_filtered['NetSalesDate'] >= on_shelf_date) &
(sales_filtered['NetSalesDate'] <= on_shelf_date + pd.DateOffset(days=90))
]['Invoiced/Sold/Shipped Qty'].sum()
# Calculate YTD sales
next_on_shelf_date = consol_data.loc[
(consol_data['MaterialNo'] == material_no) &
(consol_data['on-shelf date'] > on_shelf_date),
'on-shelf date'
].min()
if pd.isnull(next_on_shelf_date):
consol_data.at[index, 'YTD sales'] = sales_filtered['Invoiced/Sold/Shipped Qty'].sum()
else:
if c_o == "C/O":
consol_data.at[index, 'YTD sales'] = sales_filtered[
(sales_filtered['NetSalesDate'] >= on_shelf_date) &
(sales_filtered['NetSalesDate'] < next_on_shelf_date)
]['Invoiced/Sold/Shipped Qty'].sum()
else:
consol_data.at[index, 'YTD sales'] = sales_filtered['Invoiced/Sold/Shipped Qty'].sum()
print(f"Processed row {index+1} of {len(consol_data)}")
# Save the result to a new Excel file
consol_data.to_excel(output_file, index=False)
print("Processing complete. Output file saved")