- You have not described the purpose of `payload`. If this is a JSON payload going to some other web service, `"Not found"` is a poor choice for a missing value; `None` would be more appropriate.
- You never use
payload['store']
- Your selector `h1[class="product-page-header"]` should just be `h1.product-page-header`.
- I think your regex for
JetshopData is unnecessarily permissive. If the format breaks, you should be notified by a parse failure rather than silently letting a changed design through - since the outer dictionary format will likely not be the only thing to change
- You should constrain your regex to only looking in
<script> values rather than through the entire HTML document
- `'{}'.format` is redundant
- Tell
requests when you're done with the response via context management; conversely, there is no benefit to del bs4 if you have proper method scope
- It's likely that you should be looking at all variations instead of just the first
- Don't blanket-`except`. If there is a specific exception that you want to ignore, catch and ignore only that exception, in a narrow manner.
- Separate your scraping code from your payload formation code
The following suggested code uses BeautifulSoup because it's what I'm more familiar with and I didn't want to bother installing selectolax:
import json
import re
from dataclasses import dataclass
from pprint import pprint
from typing import Optional, List
import requests
from bs4 import BeautifulSoup
@dataclass
class Product:
    """Product details scraped from a single shop product page.

    Attributes:
        name: Stripped text of the ``h1.product-page-header`` element, or
            ``None`` when the page has no such element.
        price: Stripped text of the first ``span.price`` element, or ``None``
            when the page has no such element.
        image: ``content`` attribute of the ``og:image`` meta tag, or ``None``
            when the tag or the attribute is missing.
        sizes: Variation values collected from the buyable entries of the
            page's embedded ``JetshopData`` object; empty when no such
            script is present.
    """

    name: Optional[str]
    price: Optional[str]
    image: Optional[str]
    sizes: List[str]

    # Compiled once instead of on every call. Left unannotated on purpose so
    # that @dataclass does not turn it into an instance field. The pattern is
    # intentionally strict: it must match the entire serialized <script> tag.
    _JETSHOP_SCRIPT = re.compile(
        r'^<script>var JetshopData='
        r'(\{.*\})'
        r';</script>$',
    )

    @staticmethod
    def get_sizes(doc: "BeautifulSoup") -> List[str]:
        """Collect the buyable variation values embedded in the page.

        Each ``<script>`` element of *doc* is serialized with ``str()`` and
        matched against the ``var JetshopData={...};`` pattern; the first
        match is decoded as JSON.

        Args:
            doc: Parsed document. Only ``find_all('script')`` is called on it.

        Returns:
            Every item of ``Variation`` for each entry of
            ``ProductInfo.Attributes.Variations`` whose ``IsBuyable`` value is
            truthy, in document order. An empty list when no script matches.

        Raises:
            json.JSONDecodeError: The captured text is not valid JSON.
            KeyError: The decoded object lacks one of the expected keys. This
                is left to propagate so that a changed page format is noticed
                instead of silently producing no sizes.
        """
        candidates = (
            Product._JETSHOP_SCRIPT.match(str(script))
            for script in doc.find_all('script')
        )
        match = next((m for m in candidates if m is not None), None)
        if match is None:
            return []

        data = json.loads(match[1])
        return [
            variation
            for group in data['ProductInfo']['Attributes']['Variations']
            if group.get('IsBuyable')
            for variation in group['Variation']
        ]

    @classmethod
    def from_page(cls, url: str, timeout: float = 10.0) -> Optional['Product']:
        """Fetch *url* and build a ``Product`` from the returned HTML.

        Args:
            url: Address of the product page.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block indefinitely on a stalled
                connection.

        Returns:
            The scraped product, or ``None`` when the response status is not
            OK.

        Raises:
            requests.RequestException: The request failed or timed out.
            json.JSONDecodeError, KeyError: See ``get_sizes``.
        """
        # The context manager releases the connection as soon as the body
        # has been parsed; everything below works on the parsed document.
        with requests.get(url, timeout=timeout) as response:
            if not response.ok:
                return None
            doc = BeautifulSoup(response.text, 'html.parser')

        name = doc.select_one('h1.product-page-header')
        price = doc.select_one('span.price')
        image = doc.select_one('meta[property="og:image"]')

        return cls(
            name=name.text.strip() if name is not None else None,
            price=price.text.strip() if price is not None else None,
            # .get() keeps a meta tag without a content attribute from
            # raising KeyError; the field is Optional, so None is valid.
            image=image.get('content') if image is not None else None,
            sizes=cls.get_sizes(doc),
        )

    @property
    def payload(self) -> dict:
        """Return the product as a plain dict for downstream consumers.

        Missing or empty ``name``, ``price`` and ``image`` values are
        replaced by the string ``"Not found"``; ``sizes`` is passed through
        unchanged.
        """
        return {
            "name": self.name or "Not found",
            "price": self.price or "Not found",
            "image": self.image or "Not found",
            "sizes": self.sizes,
        }
def main():
    """Scrape one hard-coded product page and print the resulting payload.

    Prints ``New payload:`` followed by the pretty-printed payload when the
    page could be fetched, and ``No new payload`` otherwise.
    """
    url = "https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103"
    product = Product.from_page(url)

    if product is not None:
        print('New payload:')
        pprint(product.payload)
        return

    print('No new payload')
# Run the scrape only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()