I have a pretty huge dataset that needs to be uploaded to a database (Postgres in my case). If I use a plain to_sql with a chunksize of 5000, some chunks fail because part of the data is corrupted. I could catch the exception and re-insert the failed chunk row by row to skip the bad rows, but iterating over each row takes quite a bit of time.
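Roughly, the slow row-by-row fallback I want to avoid looks like this (just a sketch; the helper name is mine, and the table, schema and engine are the same ones used in my code below):

import pandas as pd

# Sketch of the fallback: try the whole chunk first; if the insert fails,
# retry row by row and skip the rows that raise.
def insert_chunk_skipping_bad_rows(chunk, engine):
    try:
        chunk.to_sql(name='test_table_2', schema='kkt', con=engine,
                     index=False, if_exists='append')
    except Exception:
        for row in range(len(chunk)):
            try:
                chunk.iloc[row:row + 1].to_sql(name='test_table_2', schema='kkt',
                                               con=engine, index=False,
                                               if_exists='append')
            except Exception:
                continue  # skip the corrupted row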
What I'm trying to do instead:
def insert_do_nothing_on_conflicts(sqltable, conn, keys, data_iter):
    from sqlalchemy.dialects.postgresql import insert
    from sqlalchemy import table, column

    # build a lightweight table object with the columns pandas passes in
    columns = []
    for c in keys:
        columns.append(column(c))

    if sqltable.schema:
        table_name = '{}.{}'.format(sqltable.schema, sqltable.name)
    else:
        table_name = sqltable.name
    mytable = table(table_name, *columns)

    # INSERT ... ON CONFLICT (timestamp) DO NOTHING
    insert_stmt = insert(mytable).values(list(data_iter))
    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(index_elements=['timestamp'])
    conn.execute(do_nothing_stmt)
import pandas as pd
# engine_postgres is a SQLAlchemy engine created elsewhere

def write_to_db(filename):
    chunksize = 10 ** 6
    chunk_count = 0
    for chunk in pd.read_csv(filename, chunksize=chunksize, on_bad_lines='skip',
                             header=None, encoding='windows-1251'):
        # skip the chunks that have already been loaded
        if chunk_count <= 5:
            chunk_count += 1
            continue
        # write the chunk to Postgres in batches of 5000 rows
        i = 0
        while i < len(chunk):
            chunk[i:i + 5000].to_sql(name='test_table_2', schema='kkt',
                                     con=engine_postgres, index=False,
                                     if_exists='append',
                                     method='insert_do_nothing_on_conflicts')
            i += 5000
            if i % 100000 == 0:
                print(i // 5000, (len(chunk) // 5000) + 2)
        chunk_count += 1
But I receive this error:
ValueError: Invalid parameter method: insert_do_nothing_on_conflicts
The method parameter should work, since I have seen it used in other code, but no matter which method name I put in, none of them work.