
I am scraping a website and storing the data into MySQL. The code works fine, but after some time it gives the following error. I am using Python 3.5.1 and PyMySQL to connect to the database.

pymysql.err.OperationalError: (2013, 'Lost connection to MySQL server during query')

Here is my code:

from bs4 import BeautifulSoup
import urllib.request
import re
import json
import pymysql
import pymysql.cursors


connection = pymysql.connect(host='XXX.XXX.XXX.XX',
                             user='XXX',
                             password='XXX',
                             db='XXX',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

r = urllib.request.urlopen('http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware')
soup = BeautifulSoup(r, "html.parser")

links = soup.find_all("a", href=re.compile(r"expexhibitorlist\.aspx\?categoryno=[0-9]+"))
linksfromcategories = ([link["href"] for link in links])

string = "http://i.cantonfair.org.cn/en/"
linksfromcategories = [string + x for x in linksfromcategories]


for link in linksfromcategories:

  response = urllib.request.urlopen(link)
  soup2 = BeautifulSoup(response, "html.parser")

  links2 = soup2.find_all("a", href=re.compile(r"ExpExhibitorList\.aspx\?categoryno=[0-9]+"))
  linksfromsubcategories = ([link["href"] for link in links2])

  linksfromsubcategories = [string + x for x in linksfromsubcategories]
  for link in linksfromsubcategories:

        response = urllib.request.urlopen(link)
        soup3 = BeautifulSoup(response, "html.parser")
        links3 = soup3.find_all("a", href=re.compile(r"ExpExhibitorList\.aspx\?categoryno=[0-9]+"))
        linksfromsubcategories2 = ([link["href"] for link in links3])

        linksfromsubcategories2 = [string + x for x in linksfromsubcategories2]
        for link in linksfromsubcategories2:

              response2 = urllib.request.urlopen(link)
              soup4 = BeautifulSoup(response2, "html.parser")
              companylink = soup4.find_all("a", href=re.compile(r"expCompany\.aspx\?corpid=[0-9]+"))
              companylink = ([link["href"] for link in companylink])
              companydetail = soup4.find_all("div", id="contact")
              companylink = [string + x for x in companylink]
              my_list = list(set(companylink))

              for link in my_list:
                  print (link)
                  response3 = urllib.request.urlopen(link)
                  soup5 = BeautifulSoup(response3, "html.parser")
                  companydetail = soup5.find_all("div", id="contact")                      
                  for d in companydetail:
                        lis = d.find_all('li')
                        companyname = lis[0].get_text().strip()
                        companyaddress = lis[1].get_text().strip()
                        companycity = lis[2].get_text().strip()
                        try:
                            companypostalcode = lis[3].get_text().strip()
                            companypostalcode = companypostalcode.replace(",", "")
                        except IndexError:
                            # no postal code listed for this company
                            companypostalcode = 'null'
                        try:
                            companywebsite = lis[4].get_text().strip()
                            companywebsite = companywebsite.replace("\xEF\xBC\x8Cifl...","")
                        except IndexError:
                            companywebsite = 'null'


                        try:
                            with connection.cursor() as cursor:


                                print ('saving company details to db')
                                cursor.execute("""INSERT INTO company(
                                                                       companyname,address,city,pincode,website) 
                                                                   VALUES (%s, %s, %s, %s, %s)""",
                                                                   (companyname, companyaddress, companycity, 
                                                                    companypostalcode, companywebsite))
                            connection.commit()

                        finally:
                            print ("Company Data saved")
                  productlink = soup5.find_all("a", href=re.compile(r"ExpProduct\.aspx\?corpid=[0-9]+&categoryno=[0-9]+"))
                  productlink = ([link["href"] for link in productlink])

                  productlink = [string + x for x in productlink]
                  productlinkun = list(set(productlink))
                  for link in productlinkun:

                      print (link)
                      responseproduct = urllib.request.urlopen(link)
                      soupproduct = BeautifulSoup(responseproduct, "html.parser")
                      productname = soupproduct.select('div[class="photolist"] li a')
                      for element in productname:
                          print ("====================Product Name=======================")
                          productnames = element.get_text().strip()
                          print (productnames)
                          try:
                              with connection.cursor() as cursor:

                                  # Create a new record
                                  print ('saving products to db')
                                  cursor.execute("""INSERT INTO products(
                                                                       companyname,products) 
                                                                   VALUES (%s, %s)""",
                                                                   (companyname, productnames))
                                  connection.commit()

                          finally:
                              print ("Products Data Saved")

Now I can't figure out where my code is going wrong.

2 Comments
  • First of all, please hide your password and login. Commented Mar 5, 2016 at 17:49
  • This code connects to a remote SQL server. Sometimes the network drops and the connection is interrupted. You should use a try/except/finally block, because finally alone does not handle the error. Commented Mar 5, 2016 at 17:58

1 Answer


Hope it can help:

while True:  # keep retrying until the data is saved
    try:
        with connection.cursor() as cursor:
            print('saving company details to db')
            cursor.execute("""INSERT INTO company(
                                   companyname,address,city,pincode,website)
                               VALUES (%s, %s, %s, %s, %s)""",
                           (companyname, companyaddress, companycity,
                            companypostalcode, companywebsite))
        connection.commit()
        break
    except pymysql.err.OperationalError:
        # the server dropped the connection; reconnect, then retry the insert
        connection.ping(reconnect=True)
print("Company Data saved")

You can also see a similar question about using a connection pool,

or read the PyMySQL source.
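
If you go the pooling route, here is a minimal sketch assuming the DBUtils package (the import path is dbutils.pooled_db in DBUtils 2.x; older 1.x releases spell it DBUtils.PooledDB):

import pymysql
import pymysql.cursors
from dbutils.pooled_db import PooledDB  # pip install DBUtils

# ping=1 checks each connection as it is handed out of the pool,
# so a connection the server has dropped is replaced transparently.
pool = PooledDB(creator=pymysql, maxconnections=5, ping=1,
                host='XXX.XXX.XXX.XX', user='XXX', password='XXX',
                db='XXX', charset='utf8mb4',
                cursorclass=pymysql.cursors.DictCursor)

connection = pool.connection()   # borrow a connection per unit of work
with connection.cursor() as cursor:
    cursor.execute("SELECT 1")
connection.close()               # returns the connection to the pool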


3 Comments

It now gives the error 'ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host'.
Did this work? I am running into the same issue. It looks to be caused by trying to commit over 500k rows.
@JeffB It worked. Maybe you are trying to do the insert in a single transaction? If so, just split the data into smaller pieces.
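
For the 500k-row case in the comments above, a minimal sketch of splitting one huge transaction into small ones (the rows list and the batch size are hypothetical; the table matches the question's products table):

def insert_in_batches(connection, rows, batch_size=1000):
    """Insert (companyname, products) tuples in small transactions."""
    sql = """INSERT INTO products(companyname, products)
             VALUES (%s, %s)"""
    for start in range(0, len(rows), batch_size):
        with connection.cursor() as cursor:
            cursor.executemany(sql, rows[start:start + batch_size])
        connection.commit()  # one commit per batch keeps each transaction small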
