I am relatively new to machine learning, and I believe one of the best ways to build intuition for an algorithm is to write it from scratch before reaching for external libraries.
This regression model I wrote seems to yield reasonable results on the dataset I provided. Each row of the dataset pairs the number of hours a student studied for a test (x) with the score that same student got on the test (y).
I tried to use OOP as much as I could, instead of writing the algorithm procedurally.
Would you mind giving me your opinions and comments on this code? This also matters because I'll be adding it to my portfolio. Are any good practices missing? What would you recommend keeping or removing, whether in a professional setting or in general as a developer?
Univariate linear regression algorithm:
# Linear equation based on: y = m * x + b, which is the same as h = theta1 * x + theta0
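# For reference, the mean-squared-error cost this model minimizes, and the
# gradient descent updates implemented below (alpha is the learning rate):
#   J(b, m) = (1 / (2 * M)) * sum_{i=1..M} ((m * x_i + b) - y_i) ** 2
#   b <- b - alpha * (1 / M) * sum_{i=1..M} ((m * x_i + b) - y_i)
#   m <- m - alpha * (1 / M) * sum_{i=1..M} ((m * x_i + b) - y_i) * x_i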
import numpy as np

DATASET_PATH = "data.csv"  # Placeholder file name so the script runs; point this at the actual CSV.
class LinearRegressionModel:
    """
    Univariate linear regression model.
    """

    def __init__(self, dataset, learning_rate, num_iterations):
        """
        Class constructor.
        """
        self.dataset = np.array(dataset)
        self.b = 0  # Initial guess for the intercept 'b'.
        self.m = 0  # Initial guess for the slope 'm'.
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.M = len(self.dataset)  # Number of training examples.

    def apply_gradient_descent(self):
        """
        Runs the gradient descent step 'num_iterations' times.
        """
        for _ in range(self.num_iterations):
            self.do_gradient_step()

    def do_gradient_step(self):
        """
        Performs one step of gradient descent, updating 'b' and 'm'.
        """
        b_summation = 0
        m_summation = 0
        # Accumulate the partial derivatives of the cost over all examples.
        for i in range(self.M):
            x_value = self.dataset[i, 0]
            y_value = self.dataset[i, 1]
            b_summation += ((self.m * x_value) + self.b) - y_value
            m_summation += (((self.m * x_value) + self.b) - y_value) * x_value
        # Update 'b' and 'm' together, scaled by the learning rate.
        self.b = self.b - (self.learning_rate * (1 / self.M) * b_summation)
        self.m = self.m - (self.learning_rate * (1 / self.M) * m_summation)

    def compute_error(self):
        """
        Computes the total error based on the linear regression cost
        function (mean squared error), without mutating any state.
        """
        total_error = 0
        for i in range(self.M):
            x_value = self.dataset[i, 0]
            y_value = self.dataset[i, 1]
            total_error += (((self.m * x_value) + self.b) - y_value) ** 2
        return total_error / (2 * self.M)

    def __str__(self):
        return "Results: b: {}, m: {}, Final total error: {}".format(
            round(self.b, 2), round(self.m, 2), round(self.compute_error(), 2))

    def get_prediction_based_on(self, x):
        return round(float((self.m * x) + self.b), 2)  # Cast the NumPy float to a plain Python float.


def main():
    # Loading dataset.
    school_dataset = np.genfromtxt(DATASET_PATH, delimiter=",")
    # Creating the 'LinearRegressionModel' object.
    lr = LinearRegressionModel(school_dataset, 0.0001, 1000)
    # Applying gradient descent.
    lr.apply_gradient_descent()
    # Getting some predictions.
    hours = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    for hour in hours:
        print("Studied {} hours and got {} points.".format(hour, lr.get_prediction_based_on(hour)))
    # Printing the fitted parameters and the final error.
    print(lr)


if __name__ == "__main__":
    main()
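
As a side note, here is a rough vectorized sketch of the same gradient step using NumPy array operations instead of the Python loop. The standalone function name and signature are my own illustration, not part of the class above; it assumes the same two-column hours/scores layout and reuses the np import from the script:

def gradient_step_vectorized(x, y, b, m, learning_rate):
    # One gradient descent step over the whole column vectors 'x' and 'y'.
    errors = (m * x + b) - y  # Residual of every example at once.
    b_new = b - learning_rate * errors.mean()
    m_new = m - learning_rate * (errors * x).mean()
    return b_new, m_new

# Example usage with the dataset loaded as in main():
# data = np.genfromtxt(DATASET_PATH, delimiter=",")
# b, m = 0.0, 0.0
# for _ in range(1000):
#     b, m = gradient_step_vectorized(data[:, 0], data[:, 1], b, m, 0.0001)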
Dataset snippet:
32.502345269453031,31.70700584656992
53.426804033275019,68.77759598163891
61.530358025636438,62.562382297945803
47.475639634786098,71.546632233567777
59.813207869512318,87.230925133687393
55.142188413943821,78.211518270799232
52.550014442733818,71.300879886850353
45.419730144973755,55.165677145959123
