Example of a Python crawler that scrapes NBA data

  • 2020-10-07 18:45:18
  • OfStack

This article presents an example of a Python crawler that scrapes NBA data. It is shared here for your reference; the details are as follows:

The site being crawled is stat-nba.com. The crawler collects NBA regular-season data from the start of the 2016-2017 season through January 7, 2017.

Change url_header and url_tail to crawl other specific data.

The source code is as follows:


#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import time
import urllib
from bs4 import BeautifulSoup
import re
from pyExcelerator import *
def getURLLists(url_header,url_tail,pages):
  """
  Build the URL of every result page to crawl.

  Each URL is ``url_header + <page number> + url_tail``. Page 0 is always
  included, followed by pages 1 through ``pages`` inclusive.

  :param url_header: part of the URL that precedes the page number.
  :param url_tail: part of the URL that follows the page number.
  :param pages: number of the last page to include.
  :return: list of URL strings ordered by page number.
  """
  url_lists = [url_header + '0' + url_tail]
  # Echo the first URL so the query can be checked by eye. The parenthesised
  # form prints the same text on Python 2 and also parses on Python 3.
  print(url_lists[0])
  url_lists.extend(
    url_header + str(page) + url_tail for page in range(1, pages + 1)
  )
  return url_lists
def getNBAAllData(url_lists):
  """
  Fetch every page in ``url_lists`` and merge the results into one list.

  Each URL is passed to getNBASingleData(); the returned lists are
  concatenated in the order of ``url_lists`` and empty entries are dropped.

  :param url_lists: iterable of page URLs.
  :return: flat list of the non-empty entries from all pages.
  """
  datasets = []
  for url in url_lists:
    datasets.extend(getNBASingleData(url))
  # Filter in a single pass: calling list.remove() once per blank entry
  # rescans the list from the start each time, which is quadratic.
  return [item for item in datasets if item]
def getNBASingleData(url):
  """
  Fetch one result page and return the text of its table body, line by line.

  The page is downloaded, parsed with BeautifulSoup, and the text of the
  first ``<tbody>`` found under ``<body>`` is split on newlines.

  :param url: address of the page to download.
  :return: list of strings, one per line of the ``<tbody>`` text; blank
    lines are kept as empty strings.
  :raises ValueError: if the page contains no ``<tbody>`` element.
  """
  response = urllib.urlopen(url)
  try:
    html = response.read()
  finally:
    # Release the connection even if the read fails.
    response.close()
  # NOTE(review): no parser is named, so BeautifulSoup uses whichever one is
  # installed; results may differ between environments -- consider pinning.
  soup = BeautifulSoup(html)
  tbody = soup.html.body.find('tbody')
  if tbody is None:
    # Fail with the offending URL instead of an opaque AttributeError on None.
    raise ValueError('no <tbody> found in page: %s' % url)
  return tbody.text.split('\n')
def saveDataToExcel(datasets,sheetname,filename):
  """
  Write the flat ``datasets`` list into a spreadsheet, 24 values per row.

  Row 0 holds the column headers. The values of ``datasets`` are then laid
  out left to right, top to bottom, starting at row 1. If the number of
  values is not a multiple of the column count, the last row is simply
  shorter; the workbook is still saved.

  :param datasets: flat sequence of cell values in row-major order.
  :param sheetname: name of the worksheet to create.
  :param filename: path the workbook is saved to.
  """
  headers = [
    u' The serial number ',
    u' The team ',
    u' time ',
    u' The results of ',
    u' host ',
    u' The game ',
    u' Field goal percentage ',
    u' Hit the number ',
    u' Number of shots ',
    u'3 shooting ',
    u'3 Points, hit a few ',
    u'3 Points to a few ',
    u' Free throw percentage ',
    u' Number of free throws made ',
    u' Free throw attempts ',
    u' rebounds ',
    u' Offensive rebounds ',
    u' Backcourt rebound ',
    u' assists ',
    u' steals ',
    u' blocks ',
    u' error ',
    u' A foul ',
    u' score ',
  ]
  # Workbook is presumably supplied by the file's pyExcelerator star import.
  book = Workbook()
  sheet = book.add_sheet(sheetname)
  for col, title in enumerate(headers):
    sheet.write(0, col, title)

  num = len(headers)
  data_len = len(datasets)
  print('data_len: %s' % data_len)
  # Walk the data one row-sized slice at a time. Slicing stops at the end of
  # the list, so a trailing partial row cannot index out of range.
  for row_cnt, start in enumerate(range(0, data_len, num), 1):
    print(' The serial number : %s' % row_cnt)
    for col, value in enumerate(datasets[start:start + num]):
      sheet.write(row_cnt, col, value)
  book.save(filename)
def writeDataToTxt(datasets):
  """
  Write ``datasets`` to ``nba_data.txt`` as tab-separated rows of 24 fields.

  Every value is followed by a tab, and a newline is written after each
  24th value. To keep columns visually aligned, the second field of a row
  (the team name) gets an extra tab when it is shorter than five characters.

  :param datasets: flat sequence of strings in row-major order.
  """
  # ``with`` closes the file even if a write fails part-way through.
  with open('nba_data.txt','w') as fp:
    # Iterate over every value, including the last one, so the final row is
    # complete and newline-terminated.
    for line_cnt, item in enumerate(datasets, 1):
      short_team_name = line_cnt % 24 == 2 and len(item) < 5
      # This exact name also gets the extra tab, whatever column it is in.
      if short_team_name or item == u' Philadelphia 76 people ':
        fp.write(item+'\t\t')
      else:
        fp.write(item+'\t')
      if line_cnt % 24 == 0:
        fp.write('\n')
if __name__ == "__main__":
  # Script entry point: build the page URLs, download everything, then write
  # the result as a text file and as a spreadsheet.
  # Presumably 1132 result rows at 150 rows per page -- TODO confirm.
  pages = 1132 // 150
  url_header = 'http://stat-nba.com/query_team.php?page='
  url_tail = '&QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2016&Season1=2017#label_show_result'

  url_lists = getURLLists(url_header,url_tail,pages)
  datasets = getNBAAllData(url_lists)
  writeDataToTxt(datasets)

  # The spreadsheet file name carries today's local date.
  str_time = time.strftime('%Y-%m-%d')
  sheetname = 'nba normal data 2016-2017'
  filename = 'nba_normal_data%s.xls' % str_time
  saveDataToExcel(datasets,sheetname,filename)

For more Python-related content, see this site's topic collections: "Python Socket Programming Skills Summary", "Python Regular Expression Usage Summary", "Python Data Structures and Algorithms Tutorial", "Python Function Usage Techniques", "Python String Skills Summary", "Python Introduction and Advanced Tutorial" and "Python File and Directory Skills Summary".

I hope this article is helpful to you in your Python programming.


Related articles: