One option is to use Numba. Because it loops over the data directly instead of materializing the full broadcast intermediate array, it consumes less memory and is actually faster too.
```python
import pandas as pd
import numpy as np
import numba as nb


def compute_distances(df):
    # Work column-wise: each column of the DataFrame is one point
    i = df.values.T
    result = np.empty((len(i), len(i)), dtype=i.dtype)
    _compute_distances_nb(i, result)
    return result


@nb.njit(parallel=True)
def _compute_distances_nb(data, result):
    # Only the outer loop is parallelized; the inner loops stay sequential
    for i in nb.prange(data.shape[0]):
        for j in range(data.shape[0]):
            s = 0.0
            for k in range(data.shape[1]):
                d = data[i, k] - data[j, k]
                if not np.isnan(d):  # skip coordinates where either value is NaN
                    s += np.square(d)
            result[i, j] = np.sqrt(s)
```
```python
# Original method for comparison
def compute_distances_np(df):
    i = df.values.T
    # Broadcasting builds a (n_cols, n_cols, n_rows) intermediate array
    return np.nansum((i - i[:, None]) ** 2, axis=2) ** .5
```
```python
# Test
np.random.seed(0)

# Make random data
df = pd.DataFrame(np.random.random((100, 500)))

# Put in some NaN values
df[np.random.random(df.shape) < .2] = np.nan

# Compute distances
d1 = compute_distances(df)
d2 = compute_distances_np(df)
print(np.allclose(d1, d2))
# True

%timeit compute_distances(df)
# 8.05 ms ± 698 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit compute_distances_np(df)
# 356 ms ± 14.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
Alternatively, if you are okay with just replacing the NaN values with zeros in the original array, you can use `scipy.spatial.distance.pdist`.
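A minimal sketch of that route (assuming the zero-filling is acceptable; note the result will generally differ from the NaN-skipping versions above, since a zeroed value still contributes to the squared difference):

```python
from scipy.spatial.distance import pdist, squareform

# Replace NaNs with zeros, then compute column-wise Euclidean distances.
# pdist operates on rows, so transpose first; squareform expands the
# condensed output into a full (n_cols, n_cols) matrix comparable to d1/d2.
filled = df.fillna(0).values.T
d3 = squareform(pdist(filled))
```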