I am working on a personal machine-learning (ML) project to predict the weather. Right now, I am working in a Jupyter Notebook; once it is done, I will start converting it into a Flask app.
I have completed the code in the notebook, and it all runs fine. However, I am not sure whether I am doing everything correctly.
Would you please review the code below and let me know whether I am on the right track?
Here is my code:
# %% [markdown]
# # Seattle Weather Category Prediction - Jupyter Notebook
#
# This notebook aims to predict the weather category (e.g., sun, rain, snow)
# based on other meteorological features. It follows a structure similar to
# the provided Kaggle example, using Gaussian Naive Bayes.
#
# ## Steps:
# (Original numbered steps removed as per feedback, structure implied by headers)
# - Load and Explore Data
# - Visualize Data
# - Preprocess Data (Label Encoding for target, add lagged features)
# - Train and Evaluate Models
# - Conduct Ablation Study
# - Save the Model and Label Encoder for Flask App
# %% [markdown]
# ## Setup and Load Data
# %%
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib
# %%
# Display plots inline
%matplotlib inline
# Set some display options for Pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# %% [markdown]
# ### Helper Functions
# %%
def print_header(message):
    """Prints a formatted header message."""
    print(f"\n{message}:")

def load_and_preprocess_data(file_path):
    """Loads the dataset and performs initial date conversion."""
    df = pd.read_csv(file_path)
    print_header("Dataset loaded successfully")
    df['date'] = pd.to_datetime(df['date'])  # Convert date column to datetime
    return df

def create_visualization_df(dataframe):
    """Creates a copy of the dataframe for visualization and extracts year/month."""
    df_vis = dataframe.copy()
    df_vis['year'] = df_vis['date'].dt.year
    df_vis['month'] = df_vis['date'].dt.month
    return df_vis
def create_lag_and_delta_features(df_input, features_to_lag, temp_features_for_delta, lag_period=1):
    """
    Adds lagged versions of specified features and delta features for temperature.
    Assumes df_input is already sorted by date and contains necessary base features.
    """
    df_out = df_input.copy()  # Work on a copy
    # Ensure DataFrame is sorted by date if 'date' column exists.
    # This is crucial for time-series operations like shift().
    if 'date' in df_out.columns:
        df_out = df_out.sort_values(by='date')
    else:
        # If no 'date' column, we assume the user has pre-sorted it externally.
        # For this specific notebook, 'date' will be present in df_for_lagged.
        print("Warning: 'date' column not present for sorting. Assuming pre-sorted data for lag features.")
    # Create lagged features
    for feature in features_to_lag:
        df_out[f'{feature}_lag{lag_period}'] = df_out[feature].shift(lag_period)
    # Create delta (difference) features for temperature
    for temp_feature in temp_features_for_delta:
        current_feature_name = temp_feature  # e.g., 'temp_max'
        lagged_feature_name = f'{temp_feature}_lag{lag_period}'  # e.g., 'temp_max_lag1'
        if lagged_feature_name in df_out.columns:  # Ensure the lagged feature exists
            # Using a more descriptive delta name, e.g., 'delta_temp_max'
            df_out[f'delta_{temp_feature}'] = df_out[current_feature_name] - df_out[lagged_feature_name]
        else:
            print(f"Warning: Lagged feature {lagged_feature_name} not found for delta calculation of {current_feature_name}.")
    # Drop rows with NaN values introduced by shifting (typically the first 'lag_period' rows)
    df_out = df_out.dropna().reset_index(drop=True)
    return df_out
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, target_names, model_name="Model"):
    """Trains a model and prints evaluation metrics."""
    print_header(f"--- {model_name} Training and Evaluation ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, target_names=target_names, zero_division=0)
    print(f"Accuracy: {accuracy:.4f}")
    print_header("Confusion Matrix")
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()
    print_header("Classification Report")
    print(classification_rep)
    return model, accuracy
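# %% [markdown]
# Quick sanity check of `create_lag_and_delta_features` on a tiny made-up frame
# (the toy values below are illustrative only): the first row should be dropped by
# `dropna()`, and each remaining row should carry lag-1 and delta columns.
# %%
_toy = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=3),
    'temp_max': [10.0, 12.0, 9.0],
    'temp_min': [3.0, 4.0, 2.0],
})
print(create_lag_and_delta_features(_toy, ['temp_max', 'temp_min'], ['temp_max', 'temp_min']))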
# %%
# --- Load the dataset ---
file_path = 'seattle-weather.csv'
df = load_and_preprocess_data(file_path)
df.head()
# %% [markdown]
# ## Initial Data Exploration & Visualization
# %%
print_header("Dataset Info")
df.info()
# %%
print_header("Statistical Summary")
print(df.describe())
# %%
print_header("Missing Values Check")
print(df.isnull().sum()) # Should be 0 for this dataset
print(f"Any NA values present: {df.isna().sum().any()}")
# %%
print_header("Duplicate Rows Check")
print(f"Number of duplicated rows: {df.duplicated().sum()}") # Should be 0 for this dataset
# %%
print_header("Day with Minimum temp_min")
print(df[df['temp_min'] == df['temp_min'].min()])
# %%
print_header("Day with Maximum temp_max")
print(df[df['temp_max'] == df['temp_max'].max()])
# %%
# Define consistent bin edges for temperature histograms
# Using the feedback's suggestion for overall min/max across both temp columns
t_min_overall = pd.concat([df['temp_min'], df['temp_max']]).min()
t_max_overall = pd.concat([df['temp_min'], df['temp_max']]).max()
# Create bins with a width of 1 degree Celsius spanning the overall temperature range
bins = np.arange(np.floor(t_min_overall), np.ceil(t_max_overall) + 1, 1)
plt.figure(figsize=(12,6))
sns.histplot(data=df, x='temp_max', bins=bins, kde=True)  # 'palette' is ignored without 'hue', so it is omitted
plt.title('Distribution of Maximum Temperature')
plt.xlabel('Max Temperature (°C)')
plt.ylabel('Frequency')
plt.xlim(bins.min(), bins.max()) # Set x-axis limits based on calculated bins
plt.xticks(bins[::2]) # Show fewer ticks for clarity
plt.show()
# %%
plt.figure(figsize=(12,6))
sns.histplot(data=df, x='temp_min', bins=bins, kde=True)  # 'palette' is ignored without 'hue', so it is omitted
plt.title('Distribution of Minimum Temperature')
plt.xlabel('Min Temperature (°C)')
plt.ylabel('Frequency')
plt.xlim(bins.min(), bins.max()) # Set x-axis limits based on calculated bins
plt.xticks(bins[::2]) # Show fewer ticks for clarity
plt.show()
# %% [markdown]
# ### FacetGrid Visualizations (Month vs. Weather Variables by Year)
# The 'date' column was already converted to datetime at load time; here we extract 'year' and 'month'.
# %%
df_vis = create_visualization_df(df)
# %%
# Max Temperature vs. Month by Year
g = sns.FacetGrid(df_vis, col='year', col_wrap=4, height=3.5, aspect=1.2)
g.map(sns.lineplot, 'month', 'temp_max', errorbar=None) # errorbar=None to remove confidence intervals for clarity
g.set_axis_labels('Month', 'Max Temperature (°C)')
g.set_titles(col_template="{col_name}")
g.fig.suptitle('Max Temperature by Month for Each Year', y=1.03) # Add a main title
plt.tight_layout()
plt.show()
# %%
# Min Temperature vs. Month by Year
g = sns.FacetGrid(df_vis, col='year', col_wrap=4, height=3.5, aspect=1.2)
g.map(sns.lineplot, 'month', 'temp_min', errorbar=None)
g.set_axis_labels('Month', 'Min Temperature (°C)')
g.set_titles(col_template="{col_name}")
g.fig.suptitle('Min Temperature by Month for Each Year', y=1.03)
plt.tight_layout()
plt.show()
# %%
# Precipitation vs. Month by Year
g = sns.FacetGrid(df_vis, col='year', col_wrap=4, height=3.5, aspect=1.2)
# Lineplot might be better than scatter for trends
g.map(sns.lineplot, 'month', 'precipitation', errorbar=None)
g.set_axis_labels('Month', 'Precipitation (mm)')
g.set_titles(col_template="{col_name}")
g.fig.suptitle('Precipitation by Month for Each Year', y=1.03)
plt.tight_layout()
plt.show()
# %%
# Wind Speed vs. Month by Year
g = sns.FacetGrid(df_vis, col='year', col_wrap=4, height=3.5, aspect=1.2)
# Lineplot for trends
g.map(sns.lineplot, 'month', 'wind', errorbar=None)
g.set_axis_labels('Month', 'Wind Speed')
g.set_titles(col_template="{col_name}")
g.fig.suptitle('Wind Speed by Month for Each Year', y=1.03)
plt.tight_layout()
plt.show()
# %% [markdown]
# ### Weather Category Distribution
# %%
print_header("Weather Category Counts")
weather_counts = df['weather'].value_counts()
print(weather_counts)
# %%
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='weather', order=weather_counts.index, hue='weather', palette="viridis", legend=False)
plt.title('Distribution of Weather Types')
plt.xlabel('Weather Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
# %%
plt.figure(figsize=(10, 8))
plt.pie(weather_counts, labels=weather_counts.index, autopct='%1.1f%%', startangle=140,
        colors=sns.color_palette("viridis", len(weather_counts)))
plt.title('Distribution of Weather Types (Pie Chart)')
plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
# %% [markdown]
# ## Data Preprocessing for Classification
# The Kaggle notebook drops 'year' and 'month' after visualization and does not use 'date'.
# It then label encodes 'weather' for the target variable.
# %%
# Create a working copy of the dataframe for preprocessing
df_processed = df.copy()
# Drop the 'date' column as it won't be used directly as a feature in this specific approach.
# Note: For more advanced time-series models, date components or the date itself could be crucial.
# The Kaggle example's feature set is ['temp_min', 'temp_max', 'precipitation', 'wind'].
if 'date' in df_processed.columns:
    df_processed = df_processed.drop('date', axis=1)
print_header("DataFrame columns before modeling")
print(df_processed.columns.tolist())
df_processed.head()
# %%
# Label Encode the target variable 'weather'
le = LabelEncoder()
df_processed['weather_encoded'] = le.fit_transform(df_processed['weather'])
# Display the mapping
print_header("Label Encoding Mapping for 'weather'")
for i, class_name in enumerate(le.classes_):
    print(f"{class_name} -> {i}")
# %%
# Save the label encoder for use in the Flask app (to decode predictions)
joblib.dump(le, 'weather_label_encoder.joblib')
print_header("Saved weather_label_encoder.joblib")
df_processed.head()
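# %% [markdown]
# Quick sanity check: `inverse_transform` maps encoded labels back to their
# original strings, which is exactly what the Flask app will do with model predictions.
# %%
print_header("Label decoding sanity check")
print(le.inverse_transform(np.arange(len(le.classes_))))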
# %% [markdown]
# ### Adding Lagged Time Series Features
# We'll create features based on the previous day's observations to potentially improve model performance.
# %%
# Prepare DataFrame for lagged features: sort by date and encode 'weather'
df_for_lagged_processing = df.sort_values(by='date').copy() # Use a distinct name
# Label Encode the target variable 'weather' specifically for this lagged features DataFrame
# This ensures that 'weather_encoded' is available for lagging.
le_lagged = LabelEncoder() # Renamed from le_lag
df_for_lagged_processing['weather_encoded'] = le_lagged.fit_transform(df_for_lagged_processing['weather'])
# Define features to lag and temperature features for delta calculation
features_to_lag_list = ['precipitation', 'temp_max', 'temp_min', 'wind', 'weather_encoded']
temp_features_for_delta_list = ['temp_max', 'temp_min']
# Use the helper function to create lagged and delta features
df_with_lags = create_lag_and_delta_features(
    df_input=df_for_lagged_processing,
    features_to_lag=features_to_lag_list,
    temp_features_for_delta=temp_features_for_delta_list,
    lag_period=1
)
print_header("DataFrame with Lagged Features (from helper function)")
print(df_with_lags.head())
# Prepare the final DataFrame for training with lagged features
# Drop original 'date' and 'weather' (text) columns as they are not model inputs
# 'weather_encoded' (the target) and newly created features are kept.
df_processed_lagged = df_with_lags.drop(columns=['date', 'weather'], errors='ignore')
print_header("df_processed_lagged head")
print(df_processed_lagged.head())
# %% [markdown]
# ## Feature Selection and Train-Test Split
# %%
# Original features based on Kaggle example
original_features = ['temp_min', 'temp_max', 'precipitation', 'wind']
X_original = df_processed[original_features]
y_original = df_processed['weather_encoded']
# Features including lagged data
# Ensure 'weather_encoded' is the target, and all other relevant columns are features
# The target 'weather_encoded' in df_processed_lagged is the current day's weather.
# The feature 'weather_encoded_lag1' is the previous day's weather.
lagged_features_input_list = ['temp_min', 'temp_max', 'precipitation', 'wind',
                              'precipitation_lag1', 'temp_max_lag1', 'temp_min_lag1',
                              'wind_lag1', 'weather_encoded_lag1',
                              'delta_temp_max', 'delta_temp_min']  # Updated delta feature names
X_lagged = df_processed_lagged[lagged_features_input_list]
y_lagged = df_processed_lagged['weather_encoded'] # Target remains the same (current day's weather)
# Store the feature names model will be trained on (for Flask app input)
# We will use the original features for the primary model saved for Flask
feature_names_for_model = X_original.columns.tolist()
joblib.dump(feature_names_for_model, 'classifier_feature_names.joblib')
print(f"Saved classifier_feature_names.joblib with features: {feature_names_for_model}")
# Split data - using random split as per Kaggle example for primary model
# stratify=y is good for imbalanced classes
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(
    X_original, y_original, test_size=0.2, random_state=42, stratify=y_original
)
print_header("Original Data Split Shapes")
print(f"Original X_train shape: {X_train_original.shape}, y_train shape: {y_train_original.shape}")
print(f"Original X_test shape: {X_test_original.shape}, y_test shape: {y_test_original.shape}")
# Split data for lagged features
X_train_lagged, X_test_lagged, y_train_lagged, y_test_lagged = train_test_split(
    X_lagged, y_lagged, test_size=0.2, random_state=42, stratify=y_lagged
)
print_header("Lagged Data Split Shapes")
print(f"Lagged X_train shape: {X_train_lagged.shape}, y_train shape: {y_train_lagged.shape}")
print(f"Lagged X_test shape: {X_test_lagged.shape}, y_test shape: {y_test_lagged.shape}")
# %% [markdown]
# ## Naïve Model (Climate Prediction)
# A simple baseline model that predicts the most frequent weather type for each month.
# %%
# Extract month from date for the naive model
df_naive = df.copy()
df_naive['month'] = df_naive['date'].dt.month
# Determine the most frequent weather type for each month
monthly_most_frequent_weather = df_naive.groupby('month')['weather'].agg(lambda x: x.mode()[0] if not x.mode().empty else "Unknown")
print_header("Most frequent weather type per month (Naïve Model)")
print(monthly_most_frequent_weather)
# Evaluate the naive model
df_naive['predicted_weather_naive'] = df_naive['month'].map(monthly_most_frequent_weather)
naive_accuracy = accuracy_score(df_naive['weather'], df_naive['predicted_weather_naive'])
print(f"\nNaïve Model Accuracy (Predicting most frequent weather by month): {naive_accuracy:.4f}")
print("This simple model predicts the 'climate' for each month, rather than specific 'weather'.")
# %% [markdown]
# ## Model Training and Evaluation
# %%
# --- Gaussian Naive Bayes (Original Features) ---
nb_model_original, nb_accuracy_original = train_and_evaluate_model(
GaussianNB(), X_train_original, y_train_original, X_test_original, y_test_original,
le.classes_, "Gaussian Naive Bayes (Original Features)"
)
# --- Gaussian Naive Bayes (Lagged Features) ---
# Note: use le_lagged.classes_ here because y_lagged was encoded with le_lagged.
# For this dataset the two encoders produce identical mappings (the same set of
# weather types appears throughout), but matching the encoder to its target is more robust.
nb_model_lagged, nb_accuracy_lagged = train_and_evaluate_model(
GaussianNB(), X_train_lagged, y_train_lagged, X_test_lagged, y_test_lagged,
le_lagged.classes_, "Gaussian Naive Bayes (Lagged Features)"
)
# --- Logistic Regression (Original Features) ---
lr_model_original, lr_accuracy_original = train_and_evaluate_model(
LogisticRegression(max_iter=1000, random_state=42), X_train_original, y_train_original, X_test_original, y_test_original,
le.classes_, "Logistic Regression (Original Features)"
)
# --- Support Vector Machine (Original Features) ---
# Note: SVC defaults to the RBF kernel; pass kernel='linear' for a simpler, faster model.
# Adjust 'C' for regularization if needed.
svm_model_original, svm_accuracy_original = train_and_evaluate_model(
SVC(random_state=42), X_train_original, y_train_original, X_test_original, y_test_original,
le.classes_, "Support Vector Machine (Original Features)"
)
# %% [markdown]
# ## Ablation Study
# Let's see how different features contribute to the model's performance (using Gaussian Naive Bayes with original features).
# %%
features_to_ablate = [
    ['temp_min', 'temp_max', 'precipitation', 'wind'],  # All original
    ['temp_min', 'temp_max', 'precipitation'],          # Without wind
    ['temp_min', 'temp_max', 'wind'],                   # Without precipitation
    ['precipitation', 'wind'],                          # Only precipitation and wind
    ['temp_max', 'precipitation', 'wind'],              # Without temp_min
    ['temp_min', 'precipitation', 'wind'],              # Without temp_max
    ['temp_max'],                                       # Only temp_max
    ['wind']                                            # Only wind
]
ablation_results = {}
print_header("--- Ablation Study (Gaussian Naive Bayes with Original Features) ---")
for current_features in features_to_ablate:
    print(f"\nTraining with features: {current_features}")
    # Use df_processed which has 'weather_encoded' and original features (no date)
    X_ablation = df_processed[current_features]
    y_ablation = df_processed['weather_encoded']  # Target is always the same
    X_train_ab, X_test_ab, y_train_ab, y_test_ab = train_test_split(
        X_ablation, y_ablation, test_size=0.2, random_state=42, stratify=y_ablation
    )
    model = GaussianNB()
    model.fit(X_train_ab, y_train_ab)
    y_pred_ab = model.predict(X_test_ab)
    accuracy_ab = accuracy_score(y_test_ab, y_pred_ab)
    ablation_results[tuple(current_features)] = accuracy_ab
    print(f"Accuracy: {accuracy_ab:.4f}")
print_header("--- Ablation Study Summary ---")
for features, acc in ablation_results.items():
    print(f"Features: {features} -> Accuracy: {acc:.4f}")
# %% [markdown]
# ## Save the Model for Flask App
# We'll save the best performing model (or the original Gaussian Naive Bayes as initially planned) and the label encoder.
# For demonstration, we'll save the original Gaussian Naive Bayes model.
# %%
# Save the Gaussian Naive Bayes model (using original features)
joblib.dump(nb_model_original, 'weather_prediction_model.joblib')
print_header("Saved weather_prediction_model.joblib (Gaussian Naive Bayes with original features)")
The code above is based on feedback I received on my previous question about this project: ML Project - Weather Prediction on Jupyter Notebook.
I just want to be sure that I have implemented the feedback accurately and that everything is correct before converting the notebook into a Flask app.