I'm finally creating a class to analyse my data in a more streamlined way. It takes a CSV file and outputs some information about the table and its columns.
class Analyses:
def Types_des_colonnes(self, df):
tcol = df.columns.to_series().groupby(df.dtypes).groups
tycol = {k.name: v for k, v in tcol.items()}
return(self.tycol)
def Analyse_table(self, table):
# Renvoi un dico 'tycol' avec les types en clef et les noms des colonnes en valeur:
Types_des_colonnes(table)
nbr_types_colonnes_diff=len(tycol.keys())
type_table = table.dtypes
liste_columns = table.columns
clef_types= tycol.keys()
long_table = len(table)
nbr_cols = len(liste_columns)
print(table.describe())
print('Nombre de colonnes: '+ str(nbr_cols))
print('Nombre de types de colonnes différentes: '+str(nbr_types_colonnes_diff))
for kk in range(0,nbr_types_colonnes_diff):
print('Type: ' + tycol.keys()[kk])
print(tycol.values())
return(liste_columns)
def Analyse_colonne(self, col):
from numpy import where, nan
from pandas import isnull,core,DataFrame
# Si col est un dataframe:
if type(col) == core.frame.DataFrame:
dict_col = {}
for co in col.columns:
dict_col_Loc = Analyse_colonne(col[co]);
dict_col[co] = dict_col_Loc.values()
return(dict_col)
elif type(col) == core.series.Series:
type_col = type(col)
arr_null = where(isnull(col))[0]
type_data = col.dtype
col_uniq = col.unique()
nbr_unique= len(col_uniq)
taille_col= len(col)
nbr_ligne_vide= len(arr_null)
top_entree= col.head()
bottom_entree= col.tail()
pct_uniq= (float(nbr_unique)/float(taille_col))*100.0
pct_ligne_vide= (float(nbr_ligne_vide)/float(taille_col))*100.0
print('\n')
print(' ################# '+col.name+' #################')
print('Type des données: ' + str(type_data))
print('Taille de la colonne: ' + str(taille_col))
if nbr_unique == 1:
print('Aucune entrée unique')
else:
print('Nombre d\'uniques: '+ str(nbr_unique))
print('Pourcentage d\'uniques: '+str(pct_uniq)+' %')
if nbr_ligne_vide == 0:
print('Aucune ligne vide')
else:
print('Nombre de lignes vides: '+ str(nbr_ligne_vide))
print('Pourcentage de lignes vides: '+str(pct_ligne_vide)+' %')
dict_col = {}
dict_col[col.name] = arr_null
return(dict_col)
else:
print('Problem')
def main():
anly = Analyses()
anly.Analyse_table(df_AIS)
if __name__ == '__main__':
main()
When I run this script, I get a:
NameError: name 'tycol' is not defined
Which refers to the second line of:
def Analyse_table():
# Renvoi un dico 'tycol' avec les types en clef et les noms des colonnes en valeur:
Types_des_colonnes(table)
nbr_types_colonnes_diff=len(tycol.keys())
I know it has to do with using the 'self' properly, but I really don't understand how to do so properly. Could anybody show me how to solve this very easy problem?
(All the 'self' present in this script have been added by me only to try to make it work on my own.)
tycol, you must useself.tycolinstead.selfis used.