Skip to main content
deleted 1 character in body
Source Link
Algo
  • 339
  • 2
  • 11

My main focus wasis to avoid loops as much as possible by using vectorization, and letting numpy's functions do all the work.

My main focus was to avoid loops as much as possible by using vectorization, and letting numpy's functions do all the work.

My main focus is to avoid loops as much as possible by using vectorization, and letting numpy's functions do all the work.

Source Link
Algo
  • 339
  • 2
  • 11

Implementation of linear regression in Python

I wrote an implementation for multivariate linear regression in Python, for data in this link: http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv

My main focus was to avoid loops as much as possible by using vectorization, and letting numpy's functions do all the work.

How can I improve my code for performance and enhanced vectorization?

This is my code:

import numpy as np
import matplotlib.pyplot as plt

# read data matrix
data = np.genfromtxt('Advertising.csv', delimiter=',',dtype=float, skiprows=1)
m, n = data.shape

X = data[:, 0:n-1]
Y = data[:, n-1].reshape((m, 1))

# add unity vector column with size m to the X matrix, to account for theta_0
ones = np.ones((m, 1))
X = np.hstack((ones, X))

iterations = 500        # gradient descent iterations count
alpha = 0.01            # learning rate

theta = np.random.rand(n, 1)

def plotData():
    # get rid of X_0 constants column
    features = X[:, 1: n]
    plt.figure(1)

    plt.subplot(131)
    plt.plot(features[:, 0], Y, 'bo')
    plt.title('Ad dollars spent on TV')
    plt.ylabel('Sales')

    plt.subplot(132)
    plt.plot(features[:, 1], Y, 'ro')
    plt.title('Ad dollars spent on Radio')
    plt.ylabel('Sales')

    plt.subplot(133)
    plt.plot(features[:, 2], Y, 'yo')
    plt.title('Ad dollars spent on Newspaper')
    plt.ylabel('Sales')

    plt.show()

def computeCost():
    hypothesis = np.dot(X, theta);
    delta = np.dot((hypothesis - Y).transpose(), (hypothesis - Y))
    return (1 / m) * delta

# normalizeFeatures: scale-normalize all features to speed up gradient descent convergence
def normalizeFeatures():
    # 1) generate Average vector mu, contains the average of each feature in the X matrix
    # 2) generate Std. deviation vector sigma, contains the std. dev. of each feature in the X matrix
    # 3) subtract average value and divide by the standard deviation, for each feature column
    mu = np.ones((1, n))
    sigma = np.ones((1, n))

    # range() starts from 1 not 0, to skip the first all-ones constants column in the features matrix
    for i in range(1,n):
        mu[0][i] = np.mean(X[:, i])
        sigma[0][i] = np.std(X[:, i])
        X[:, i] = (X[:, i] - mu[0][i]) / sigma[0][i]

    return mu, sigma;

# gradientDescent() calculates hypothesis equation coefficients using gradient descent algorithm
def gradientDescent(theta):
    # vector to keep track of progression of cost function with each iteration
    J_history = np.ones((iterations, 1))

    for i in range(iterations):
        delta = np.dot((np.dot(X,theta) - Y).transpose(), X).transpose()
        theta -= (alpha/m) * delta
        J_history[i, 0] = computeCost()

    plt.plot(np.linspace(0, iterations, iterations), J_history)
    plt.title('Cost function against number of iterations')
    plt.xlabel('Number of iterations')
    plt.ylabel('Cost function J(theta)')
    plt.show()
    return

# normalEquation() calculates hypothesis equation coefficients analytically
def normalEquation():
    A = np.linalg.pinv(np.dot(X.transpose(), X))
    B = np.dot(X.transpose(), Y)
    theta = np.dot(A, B)

    return theta

def predict(x_vector, mu, sigma):
    # scale feature vector
    for i in range(1, n):
        x_vector[0][i] = (x_vector[0,i]- mu[0][i]) / sigma[0][i]

    return np.dot(x_vector, theta)

if __name__ == '__main__':
    plotData()
    mu, sigma = normalizeFeatures()
    gradientDescent(theta)
    print('Hypothesis coefficients from gradient descent:\n {}'.format(theta))
    print('Hypothesis coefficients from normal equation:\n {}'.format(normalEquation()))

    prediction_vector = np.array([1, 40, 40, 48]).reshape(1,4)
    print('Prediction for values [1, 40, 40, 48] is {}'.format(predict(prediction_vector, mu, sigma)))