I have a comma-separated value (CSV) file as input, and I am supposed to interpolate all missing (nan) values based on neighboring non-diagonal values.
The CSV looks like:
37.454012,95.071431,73.199394,59.865848,nan
15.599452,5.808361,86.617615,60.111501,70.807258
2.058449,96.990985,nan,21.233911,18.182497
nan,30.424224,52.475643,43.194502,29.122914
61.185289,13.949386,29.214465,nan,45.606998
Although the language choice was left to me, I decided that Python with Pandas would be the most familiar and readable choice for other engineers.
import argparse
import pandas as pd
import sys
import os
# --- Command-line interface -------------------------------------------------
# NOTE(review): parser._action_groups.pop() pokes a private argparse attribute;
# it removes the default "positional arguments" group so --help shows a single
# "Required arguments" section.  It works, but is not a stable API.
parser = argparse.ArgumentParser()
parser._action_groups.pop()
required = parser.add_argument_group('Required arguments')
# argparse derives the destination from the first long option string, so the
# parsed values are available below as args.i and args.o.
required.add_argument('--i', '--input_file', required=True, help='input CSV file')
required.add_argument('--o', '--output_file', required=True, help='output CSV file')
args = parser.parse_args()

# Fail early with a clear message when the input file does not exist.
if not os.path.isfile(args.i):
    print(f"File \"{args.i}\" doesn't exist", file=sys.stderr)
    sys.exit(1)

# header=None: the CSV carries raw numbers with no header row.
df = pd.read_csv(args.i, header=None)
print(df)
rows, cols = df.shape
# BUG FIX: the original did `inter_df = df`, which merely binds a second name
# to the SAME DataFrame.  Interpolated cells then fed later interpolations,
# and the min/max sanity checks further down compared the frame against
# itself.  A real copy keeps `df` as the pristine original.
inter_df = df.copy()
def numeric(val):
    """Return True when *val* is usable as a number, False otherwise.

    NaN counts as numeric here because missing cells are exactly what the
    interpolation pass is meant to fill in.

    BUG FIX: the original returned the converted value itself, so a
    legitimate cell value of 0.0 compared equal to False and was wrongly
    flagged as non-numeric; worse, pd.to_numeric raises ValueError on text
    instead of ever yielding a False result.  Catch the conversion error
    and always return a proper bool.
    """
    if pd.isna(val):
        return True
    try:
        pd.to_numeric(val)
    except (ValueError, TypeError):
        return False
    return True
print(f"rows = {rows}; cols = {cols}")

# Fill each missing cell with the mean of its non-NaN orthogonal neighbours
# (up / down / left / right — diagonals excluded).  Neighbours are read from
# `df` (the original data) and results are written into `inter_df`.
for i in range(rows):
    for j in range(cols):
        if numeric(df.iloc[i, j]) == False:  # check for non-numeric values
            # BUG FIX: the original called the undefined name `printf`,
            # which would raise NameError instead of reporting the cell.
            print(f'df[{i}][{j}] is not numeric, and there may be others',
                  file=sys.stderr)
            sys.exit(2)
        if pd.notna(df.iloc[i, j]):  # don't interpolate known values
            continue
        adjacent_val = []
        if i > 0 and not pd.isna(df.iloc[i - 1, j]):         # cell above
            adjacent_val.append(df.iloc[i - 1, j])
        if i < rows - 1 and not pd.isna(df.iloc[i + 1, j]):  # cell below
            adjacent_val.append(df.iloc[i + 1, j])
        if j > 0 and not pd.isna(df.iloc[i, j - 1]):         # left cell
            adjacent_val.append(df.iloc[i, j - 1])
        if j + 1 < cols and not pd.isna(df.iloc[i, j + 1]):  # right cell
            adjacent_val.append(df.iloc[i, j + 1])
        # BUG FIX: a NaN cell whose neighbours are all NaN would divide by
        # zero.  Leave it NaN instead, so the later NaN check reports it.
        if adjacent_val:
            inter_df.iloc[i, j] = sum(adjacent_val) / len(adjacent_val)  # mean
#--------------
# Sanity checks on the interpolation result
#--------------
# Any remaining NaN means a cell had no non-NaN neighbour to average over.
total_na_vals = inter_df.isna().sum().sum()
if total_na_vals > 0:
    print("Missing vals were found", file=sys.stderr)
    print("Rows with any missing values:\n",
          inter_df[inter_df.isna().any(axis=1)], file=sys.stderr)
    sys.exit(3)

# A mean of neighbours can never fall outside the [min, max] range of the
# source data, so a value outside that range indicates a bug.
# NOTE(review): these checks only catch anything if `inter_df` is a real
# copy of `df`, not an alias of the same object.
max_df = df.max().max()
max_inter_df = inter_df.max().max()
if max_inter_df > max_df:
    print('if interpolating, max value cannot be greater than the original data frame',
          file=sys.stderr)
    sys.exit(4)
min_df = df.min().min()
min_inter_df = inter_df.min().min()
if min_inter_df < min_df:
    print('if interpolating, min value cannot be less than the original data frame',
          file=sys.stderr)
    sys.exit(5)

# write results to specified output file
# BUG FIX: to_csv defaults write a header row of column numbers plus an index
# column, so the output would not match the headerless shape of the input.
inter_df.to_csv(args.o, sep=',', header=False, index=False)
The reviewers judged the submission insufficient in "modular code design, dependency management, and testing", even though I verified that the code produced the correct output and included numerous checks.
How could I have written this better?