Getting table data from web page using python beautifulsoup

Question

I have a webpage that displays some products. I need to visit each product, read the table under the "Technical Details" tab, and combine all of that data into one big table in Excel. I wrote the following code, but I get a blank Excel file. Where is it going wrong?

import requests
import xlsxwriter
from bs4 import BeautifulSoup


def cpap_spider(url):
    """Crawl a product listing page and record every product it links to.

    For each product cell (``<td class="name name2_padd">``) the product
    title is written to column 0 of the current worksheet row, then
    ``each_item`` is called with the product URL and the URL is printed.

    Reads the module-level ``worksheet`` and ``row_i`` globals.

    Args:
        url: Address of the listing page to crawl.
    """
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for cell in soup.findAll('td', {'class': 'name name2_padd'}):
        # The link is on the <a> nested inside the cell; asking the <td>
        # itself for 'href' always yields None.
        anchor = cell.find('a', href=True)
        if anchor is None:
            # A cell without a link gives us nothing to follow.
            continue
        href = anchor['href']
        # get_text() always returns a str, whereas .string is None whenever
        # the tag contains more than one child node.
        title = anchor.get_text(strip=True)
        worksheet.write(row_i, 0, title)
        each_item(href)
        print(href)
        

def each_item(item_url):
    """Scrape one product page and write its details table to the sheet.

    Looks up ``<table class="width_table">`` on the page. For every table
    row, the first cell is treated as the column label and the remaining
    cells as its value. Unknown labels are appended to ``cols_names`` and
    written to the header row (row 0); the value is written to the current
    data row ``row_i`` in that label's column. ``row_i`` is advanced once
    the table has been processed.

    If the page has no such table the function returns immediately and
    ``row_i`` is left unchanged.

    Uses the module-level ``worksheet``, ``cols_names`` and ``row_i``.

    Args:
        item_url: Address of the product page to scrape.
    """
    # cols_names is only mutated in place; row_i is rebound, so it needs
    # the global declaration.
    global row_i

    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    table = soup.find('table', {'class': 'width_table'})
    if not table:
        return

    for row in table.find_all('tr'):
        cells = []
        for cell in row.select('td'):
            text = cell.text.strip()
            # Drop a trailing colon, e.g. "Weight:" -> "Weight".
            if text and text[-1] == ':':
                text = text[:-1]
            cells.append(text)

        # Without a label in the first cell there is no way to tell which
        # column the values belong to, so skip the row.
        if not cells or not cells[0]:
            continue
        label = cells[0]

        try:
            column = cols_names.index(label)
        except ValueError:
            # First time this label is seen: give it a new header column.
            cols_names.append(label)
            column = len(cols_names) - 1
            worksheet.write(0, column + 1, label)

        # Only the value cells go into the data row; the label itself must
        # never be written there. When several value cells are non-empty
        # the last one wins, since they all target the same cell.
        values = [text for text in cells[1:] if text]
        if values:
            worksheet.write(row_i, column + 1, values[-1])

    row_i += 1
    
# Module-level state shared by cpap_spider() and each_item().
cols_names = []      # header labels already written, in column order
cols_names_i = 0     # kept for compatibility; the functions here do not read it
row_i = 1            # next data row to fill (row 0 holds the header)
workbook = xlsxwriter.Workbook('st.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, 'Title')

try:
    cpap_spider('https://www.respshop.com/cpap-machines/manual/')
finally:
    # The .xlsx file is only produced when the workbook is closed, so close
    # it even if the crawl fails part-way; rows written so far are kept.
    workbook.close()

Andrej Kesely · Accepted Answer · 2020-07-13 10:27:44Z

The product info is loaded via Ajax from another URL.

This script will load all technical parameters along with the name and URL of each product:

import re
import requests
import pandas as pd
from bs4 import BeautifulSoup


# Listing page to crawl and the endpoint that serves the tab contents.
url = 'https://www.respshop.com/cpap-masks/nasal/'
product_info_url = 'https://www.respshop.com/product_info.php'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0'}
# The numeric product id is embedded in the product URL as "p-<id>.html".
product_id_re = re.compile(r'p-(\d+)\.html')

soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

all_data = []
# Only anchors that actually carry an href can be followed.
for item in soup.select('td.name a[href]'):
    s = BeautifulSoup(requests.get(item['href'], headers=headers).content, 'html.parser')
    # select_one() returns None when the page has no mpn element; fall back
    # to an empty SKU instead of failing on ``None.text``.
    sku_tag = s.select_one('[itemprop="mpn"]')
    sku = sku_tag.text if sku_tag is not None else ''
    print(item.text, sku)

    row = {'Name': item.text, 'SKU': sku, 'URL': item['href']}

    # search() returns None for URLs that do not follow the expected
    # pattern; such products keep their basic columns but get no details.
    match = product_id_re.search(item['href'])
    if match is not None:
        products_id = match[1]
        # The technical details are requested separately (tab 3) by POST.
        s = BeautifulSoup(requests.post(product_info_url, data={'products_id': products_id, 'tab': 3}, headers=headers).content, 'html.parser')

        # First column holds the parameter name, second column its value.
        for k, v in zip(s.select('#cont_3 td.main:nth-child(1)'),
                        s.select('#cont_3 td.main:nth-child(2)')):
            row[k.get_text(strip=True)] = v.get_text(strip=True)
    all_data.append(row)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')

Prints:

ComfortGel Blue Nasal CPAP Mask - Philips Respironics  1070038, 1070037, 1070039, 1070040, 1070050, 1070051, 1070052, 1070049
Wisp Nasal Mask - Philips Respironics  1094051, 1094050, 1109298
Dreamwear Nasal Mask - Philips Respironics 1116700, 1116680, 1116681, 1116682, 1116683, 1116685, 1116686, 1116687, 1116688, 1116690, 1116691, 1116692, 1116693
Airfit N20 Nasal CPAP Mask by ResMed w/ 5 Free Cushions 63536, 63538, 63539
Airfit N30i - ResMed Nasal Mask  63800, 63801
New Respironics DreamWear Nasal Mask With Headgear Arm FitPack 1142376
ResMed AirFit N30 CPAP Nasal Cradle Mask 64222, 64223, 64224

...etc.

Creates data.csv (screenshot from LibreOffice):

@huy I added the import re. You need to load the product data from https://www.respshop.com/product_info.php with correct parameters.
OK, got it! I also wanted a column containing the small numbers printed on the left as "model: xxxx, ..." for each product. I can't seem to locate the path for it in the code you wrote. Any suggestions?
This gets a maximum of only 2 SKU numbers, even when a product has more than two.

Collectives™ on Stack Overflow

Getting table data from web page using python beautifulsoup

1 Answer 1

6 Comments

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

6 Comments

Related