Using Python and XPath to crawl online data and store it in a Django model

  • 2021-09-12 01:34:27
  • OfStack

While helping a friend build a website, I needed some product data. Since the site sells products as an agent for other companies, I crawled the product data directly from the original company's website.

1. Designing a database


from django.db import models
from uuslug import slugify
import uuid
import os


def products_directory_path(instance, filename):
  ext = filename.split('.')[-1]
  filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
  # return the whole path to the file
  return os.path.join('images', "products", instance.title, filename)


def product_relatedimage_directory_path(instance, filename):
  ext = filename.split('.')[-1]
  filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
  # return the whole path to the file
  return os.path.join('images', "product_relatedimage", instance.product.title, filename)


class ProductsCategory(models.Model):
  """ Product classification """
  name = models.CharField(' Product classification name ', max_length=80, unique=True)
  description = models.TextField(' Product classification description ', blank=True, null=True)
  slug = models.SlugField('slug', max_length=80, blank=True, null=True)
  parent_category = models.ForeignKey('self', verbose_name=" Parent classification ", blank=True, null=True, on_delete=models.CASCADE)

  def save(self, *args, **kwargs):
    if not self.id or not self.slug:
      self.slug = slugify(self.name)
    super().save(*args, **kwargs)

  def __str__(self):
    return self.name

  class Meta:
    ordering = ['name']
    verbose_name = " Product classification "
    verbose_name_plural = verbose_name


class ProductsTag(models.Model):
  """ Product label """
  name = models.CharField(' Product label name ', max_length=30, unique=True)
  slug = models.SlugField('slug', max_length=40)

  def __str__(self):
    return self.name

  def save(self, *args, **kwargs):
    if not self.id or not self.slug:
      self.slug = slugify(self.name)
    super().save(*args, **kwargs)

  class Meta:
    ordering = ['name']
    verbose_name = " Product label "
    verbose_name_plural = verbose_name


class Product(models.Model):
  title = models.CharField('Title', max_length=255, unique=True)
  slug = models.SlugField('slug', max_length=255, blank=True, null=True)
  jscs = models.TextField('Technical parameters', blank=True, null=True)
  image = models.ImageField(upload_to=products_directory_path, verbose_name="Product pictures")
  views = models.PositiveIntegerField('Page views', default=0)
  category = models.ForeignKey('ProductsCategory', verbose_name='Classification', on_delete=models.CASCADE, blank=True, null=True)
  tags = models.ManyToManyField('ProductsTag', verbose_name='Tag set', blank=True)

  def save(self, *args, **kwargs):
    if not self.id or not self.slug:
      self.slug = slugify(self.title)
    super().save(*args, **kwargs)

  def update_views(self):
    self.views += 1
    self.save(update_fields=['views'])

  def get_pre(self):
    return Product.objects.filter(id__lt=self.id).order_by('-id').first()

  def get_next(self):
    return Product.objects.filter(id__gt=self.id).order_by('id').first()

  def __str__(self):
    return self.title

  class Meta:
    verbose_name = " Products "
    verbose_name_plural = verbose_name


class ProductAdvantage(models.Model):
  content = models.TextField('Product advantage', blank=True, null=True)
  product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

  def __str__(self):
    return self.content

  class Meta:
    verbose_name = " Product advantage "
    verbose_name_plural = verbose_name


class ProductBody(models.Model):
  body = models.CharField('Product content', max_length=256, blank=True, null=True)
  product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

  def __str__(self):
    return self.product.title

  class Meta:
    verbose_name = " Product content "
    verbose_name_plural = verbose_name
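
With the models in place, create and apply the migrations before running the crawler. Normally you would simply run python manage.py makemigrations products followed by python manage.py migrate; the following is only a minimal standalone sketch of the same thing, assuming the project settings module jiaobanzhan.settings and the app name products that appear in section 3.


import os
import django

# Settings module and app name match the crawl script in section 3.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

from django.core.management import call_command

call_command('makemigrations', 'products')  # generate migrations for the products app
call_command('migrate')                     # apply them to the database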

2. Scripting

2.1 Write a function to get the source code of a web page


import requests


def get_one_page(url):
  try:
    headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    res = requests.get(url=url, headers=headers)
    res.encoding = 'utf-8'
    if res.status_code == 200:
      return res.text
    else:
      return None
  except Exception:
    return None

2.2 Get the links to all product classification pages from the base page


if __name__ == '__main__':
  content = get_one_page(url)
  tree = etree.HTML(content)
  #  Product classification url
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  #  Deal with catgory_urls
  for url in catgory_urls:
    url = 'http://www.kexinjianji.com' + url
    print(url)

2.3 Get all product links from each product classification page


if __name__ == '__main__':
  content = get_one_page(url)
  tree = etree.HTML(content)
  #  Product classification 
  catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
  print(" Product classification: " + catgory[0])
  #  Products under this category url
  urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
  #  Deal with url
  for url in urls:
    url = 'http://www.kexinjianji.com' + url
    print(url)
  print("=====================================================")

Combining the two, we can print out all the product links:


if __name__ == '__main__':
  content = get_one_page(url)
  tree = etree.HTML(content)
  #  Product classification url
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  #  Deal with catgory_urls
  for url in catgory_urls:
    url = 'http://www.kexinjianji.com' + url
    content = get_one_page(url)
    tree = etree.HTML(content)
    #  Product classification 
    catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    print(" Product classification: " + catgory[0])
    #  Products under this category url
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    #  Deal with url
    for url in urls:
      url = 'http://www.kexinjianji.com' + url
      print(url)
    print("=====================================================")

2.4 Parse the content of each product page with XPath


if __name__ == '__main__':
  content = get_one_page(url)
  tree = etree.HTML(content)
  #  Product name 
  title = tree.xpath('//*[@id="wrap"]//h1/text()')
  images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
  #  Product pictures 
  images_url = 'http://www.kexinjianji.com' + images[0]
  #  Performance characteristics 
  xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
  #  Technical parameters 
  jscs = tree.xpath('//table')[0]
  jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
  #  Product content 
  cpnr = tree.xpath('//div[@class="describe"]/p')
  print(' Product name :' + title[0])
  print(' Product pictures :' + images_url)
  for td in xntd:
    print(' Performance characteristics :' + td)
  print(' Technical parameters :' + jscs_str)
  for cp in cpnr:
    # string(.)  Gets all text content under the current label 
    cp = cp.xpath('string(.)')
    print(' Product content :' + cp)
  print('============================================')

Combining all three steps gives us the complete product information:


if __name__ == '__main__':
  content = get_one_page(url)
  tree = etree.HTML(content)
  #  Product classification url
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  #  Deal with catgory_urls
  for url in catgory_urls:
    url = 'http://www.kexinjianji.com' + url
    content = get_one_page(url)
    tree = etree.HTML(content)
    #  Product classification 
    catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    #  Products under this category url
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    #  Deal with url
    for url in urls:
      url = 'http://www.kexinjianji.com' + url
      content = get_one_page(url)
      try:
        tree = etree.HTML(content)
        #  Product name 
        title = tree.xpath('//*[@id="wrap"]//h1/text()')
        images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
        #  Product pictures 
        images_url = 'http://www.kexinjianji.com' + images[0]
        #  Performance characteristics 
        xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
        #  Technical parameters 
        jscs = tree.xpath('//table')[0]
        jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
        #  Product content 
        cpnr = tree.xpath('//div[@class="describe"]/p')
        print(" Product classification: " + catgory[0])
        print(' Product link :' + url)
        print(' Product name :' + title[0])
        print(' Product pictures :' + images_url)
        for td in xntd:
          print(' Performance characteristics :' + td.strip())
        # print(' Technical parameters :' + jscs_str)
        for cp in cpnr:
          # string(.)  Gets all text content under the current label 
          cp = cp.xpath('string(.)')
          print(' Product content :' + cp)
        print('============================================')
      except Exception as e:
        print(e)
        print(' Error url:' + url)
        pass

3. Store the data in the Django model


import requests
from lxml.html import etree
import os
import django
import uuid
from django.core.files.base import ContentFile

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage

url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'


def get_one_page(url):
  try:
    headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    res = requests.get(url=url, headers=headers, timeout=10)
    res.encoding = 'utf-8'
    if res.status_code == 200:
      return res.text
    else:
      return None
  except Exception:
    return None


if __name__ == '__main__':
  content = get_one_page(url)
  tree = etree.HTML(content)
  #  Product classification url
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  #  Deal with catgory_urls
  for url in catgory_urls:
    url = 'http://www.kexinjianji.com' + url
    content = get_one_page(url)
    tree = etree.HTML(content)
    #  Product classification 
    p_catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    #  Products under this category url
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    #  Deal with url
    for url in urls:
      url = 'http://www.kexinjianji.com' + url
      content = get_one_page(url)
      try:
        tree = etree.HTML(content)
        #  Product name 
        title = tree.xpath('//*[@id="wrap"]//h1/text()')
        images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
        #  Product pictures 
        images_url = 'http://www.kexinjianji.com' + images[0]
        #  Performance characteristics 
        xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
        #  Technical parameters 
        jscs = tree.xpath('//table')[0]
        jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
        #  Product content 
        cpnr = tree.xpath('//div[@class="describe"]/p')
        #  Determine whether this category exists, and create a new one if it does not exist 
        catgory = p_catgory[0]
        products_catgory = ProductsCategory.objects.filter(name=catgory).exists()
        if products_catgory:
          products_catgory = ProductsCategory.objects.get(name=catgory)
        else:
          products_catgory = ProductsCategory(name=catgory)
          products_catgory.save()
        print(products_catgory)

        #  Save product pictures 
        image_content = requests.get(url=images_url)
        ext = images_url.split('.')[-1] #  Get the picture type 
        filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext) #  Randomly generate picture names 
        upload_image_file = ContentFile(image_content.content, name=filename) #  Save the picture as django Type 
        product = Product(title=title[0], jscs=jscs_str, image=upload_image_file, category=products_catgory)
        product.save()
        for td in xntd:
          product_advantage = ProductAdvantage()
          product_advantage.content = td
          product_advantage.product = product
          product_advantage.save()
        for cp in cpnr:
          cp = cp.xpath('string(.)')
          product_body = ProductBody()
          product_body.body = cp
          product_body.product = product
          product_body.save()
      except Exception as e:
        print(e)
        print(' Error url:' + url)

Finally, handle the error URLs manually (on those pages the technical parameters could not be extracted, because they are provided as a picture rather than a table).
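
To make that manual pass easier, the failed links could be collected instead of only printed. This is just an illustrative sketch; the error_urls list, the record_error/dump_errors helpers and the errors.txt filename are not part of the original script.


# Illustrative only: none of these names appear in the original script.
error_urls = []


def record_error(url):
  # Remember a product url that failed to parse, for manual follow-up.
  error_urls.append(url)
  print('Error url: ' + url)


def dump_errors(path='errors.txt'):
  # Write the failed urls to a file so they can be reviewed by hand.
  with open(path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(error_urls))

In the crawl loop, record_error(url) would replace the print calls in the except block, and dump_errors() would be called once after the loops finish.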

4. Summary

1. Getting tag content with XPath when a span tag is nested inside a p tag; the page source looks like this:


<div class="describe" style="position: relative;"> 
   <p><span> Board    Width: </span>1500mm</p> 
   <p><span> Board    Thick: </span>4.5 mm</p> 
   <p><span> Discharge port: </span>6 Mouth </p> 
   <p><span> Heavy    Quantity: </span>6000 kg</p>
</div>

When using XPath to get the p tag content, the result I want looks like this:

Board width: 1500mm
Board thickness: 4.5 mm
Discharge ports: 6
Weight: 6000 kg

The following XPath returns the label text and the value text separately, which is not the desired effect:


//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()

After searching on Baidu, I found the solution: xpath('string(.)').
1. First, get all the p tags:


# All p tags under the describe div
cpnr = tree.xpath('//div[@class="describe"]/p')

2. Then use string(.) to get all the text under each tag:


for cp in cpnr:
  # string(.) gets all the text content under the current tag
  cp = cp.xpath('string(.)')
  print(cp)

Looping through all the p tags this way gives each label and its value on one line, which is exactly the desired output.
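
For reference, here is a small self-contained sketch (not part of the original script) that applies the same string(.) trick to the sample HTML shown above, with the spacing tidied:


from lxml import etree

html = '''
<div class="describe" style="position: relative;">
  <p><span>Board Width: </span>1500mm</p>
  <p><span>Board Thick: </span>4.5 mm</p>
  <p><span>Discharge port: </span>6 Mouth</p>
  <p><span>Heavy Quantity: </span>6000 kg</p>
</div>
'''

tree = etree.HTML(html)
for p in tree.xpath('//div[@class="describe"]/p'):
  # string(.) concatenates the text of <p> and every child tag,
  # so the <span> label and the value come back as one line.
  print(p.xpath('string(.)'))

Each print call produces one combined line, for example Board Width: 1500mm.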

