Python: using regular expressions to scrape Douban movie links and ratings (code sharing)

  • 2020-04-02 13:16:47
  • OfStack


import urllib.request
import re
import time
def movie(movieTag):
    """Download one page and return its body decoded as UTF-8.

    Args:
        movieTag: The complete URL to fetch. Despite the name, the caller
            in this file passes a full tag-listing URL, not a bare tag.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Fetch the URL that was actually passed in. Relying on a module-level
    # ``url`` made the function unusable outside the driver loop.
    # The ``with`` block closes the response even if read/decode raises.
    with urllib.request.urlopen(movieTag) as response:
        return response.read().decode('utf-8')
def subject(tagUrl_read):
    """Extract movie entries from one listing page and accumulate them.

    Three independent regex passes collect (link, title) pairs, rating
    strings and rating-count strings. The three result lists are zipped
    positionally and appended to the module-level ``newlist``.

    Args:
        tagUrl_read: HTML source of one tag-listing page, as ``str``.

    Returns:
        The module-level ``newlist`` (the same object, not a copy) after
        this page's entries were appended. Each entry has the shape
        ``((link, title), score, count)``; all leaves are strings.

    Caveats:
        * ``zip`` pairs results purely by position and stops at the
          shortest list, so a movie that lacks a rating or a count shifts
          every later entry on that page.
        * Requires a module-level list named ``newlist`` to exist.

    Author's open TODO notes (paraphrased from the original):
        * add movie posters alongside the links;
        * export the results to a local txt or Excel file;
        * match link, title, score and count in a single pass.
    """
    # Whitespace between "<span"/href and the next attribute is matched with
    # \s+; a bare "s+" would only match literal letter "s". The title stops
    # at the first closing quote so trailing attributes are not swallowed.
    nameURL = re.findall(
        r'(http://movie\.douban\.com/subject/[0-9.]+)/"\s+title="([^"]+)"',
        tagUrl_read)
    scoreURL = re.findall(
        r'<span\s+class="rating_nums">([0-9.]+)</span>',
        tagUrl_read)
    # The parentheses around the count are literal text in the page, so they
    # are escaped; only the count itself is captured.
    # NOTE(review): the "People evaluate" label looks machine-translated; the
    # live Douban markup presumably reads "人评价" -- confirm against a real
    # page. Both spellings are accepted here.
    evaluateURL = re.findall(
        r'<span\s+class="pl">\((\w+)\s*(?:People evaluate|人评价)\s*\)</span>',
        tagUrl_read)

    movieLists = list(zip(nameURL, scoreURL, evaluateURL))
    newlist.extend(movieLists)
    return newlist
# Shared accumulator: subject() appends each page's entries to this list.
# Defined outside the guard so it also exists when the file is imported.
newlist = []

if __name__ == "__main__":
    # quote() percent-encodes the tag so non-ASCII (e.g. Chinese) input is
    # safe to embed in the URL.
    movie_type = urllib.request.quote(input(' Please enter the movie type ( Such as plot, comedy, suspense ) : '))
    page_end = int(input(' Please enter the page number at the end of the search: '))

    # The ``start`` query parameter advances by 20 per listing page.
    for num in range(0, page_end * 20, 20):
        # ``url`` stays a module-level name on purpose: other code in this
        # file may look it up globally.
        url = r'http://movie.douban.com/tag/%s?start=%d' % (movie_type, num)
        subject(movie(url))

    # Rank by rating, highest first. reverse=True gives descending order
    # (the default, False, is ascending). ``key`` is called once per entry
    # and its return value is what gets compared; the score is converted to
    # float because string comparison would rank "10.0" below "9.0".
    ranked = sorted(newlist, key=lambda entry: float(entry[1]), reverse=True)
    # The loop variable is not called ``movie`` so the movie() function is
    # not rebound.
    for entry in ranked:
        print(entry)

    # Pause before the closing message (kept from the original script).
    time.sleep(3)
    print(' The end of the ')


Related articles: