Python: using urllib2 to crawl forum pages and download torrent seed files

  • 2020-04-02 13:25:43
  • OfStack

Fetch torrent seeds with the urllib2 and re modules.

Approach

1. Log in to the forum programmatically (only needed if the board requires login to view; a minimal login sketch follows this list)

2. Open the target board

3. Traverse the posts (fetch the specified listing page, then collect the URLs of all posts on that page)

4. Visit each post URL in turn and extract the torrent download address from the post's page source (with regular expressions or a third-party HTML parsing library)

5. Request the download address and save the torrent file
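
The main script below assumes the board can be browsed without logging in, so step 1 is not implemented there. If the forum did require a login, a cookie-aware opener could be installed before any page is fetched. The snippet below is only a sketch: login.php and the username/password field names are placeholders and would have to match the real forum's login form.

import urllib
import urllib2
import cookielib

# Keep cookies in a jar so the session created by the login POST
# is reused by every later urllib2.urlopen() call
cookieJar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
urllib2.install_opener(opener)

# Placeholder URL and form fields - take the real ones from the forum's login form
loginUrl = "http://xxx.yyy.zzz/login.php"
loginData = urllib.urlencode({'username': 'me', 'password': 'secret'})
urllib2.urlopen(loginUrl, loginData)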


import urllib
import urllib2
import cookielib
import re
import sys
import os
# site is the forum's base URL | fid is the board (section) id
site = "http://xxx.yyy.zzz/"
source = "thread0806.php?fid=x&search=&page="
btSave = "./clyzwm/"
if os.path.isdir(btSave):
 print btSave + " already exists"
else:
 os.mkdir(btSave)
logfile = "./clyzwm/down.log"
errorfile = "./clyzwm/error.log"
sucfile = "./clyzwm/success.log"
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',  
           'Referer' : 'http://xxx.yyy.zzz/'} 
def btDown(url, dirPath):
 logger(logfile, "download file : " + url)
 try:
  #pageCode = urllib2.urlopen(url).read()
  #print pageCode
  btStep1 = re.findall('http://[\w]+\.[\w]+\.[\w]{0,4}/[\w]{2,6}\.php\?[\w]{2,6}=([\w]+)', url, re.I)
  #print btStep1
  if len(btStep1)>0:
   ref = btStep1[0]
   downsite = ""
   downData = {}
   if len(ref)>20:
    downsite = re.findall('http://www\.[\w]+\.[\w]+/', url)[0]
    downsite = downsite + "download.php"
    reff = re.findall('input\stype="hidden"\sname="reff"\svalue="([\w=]+)"', urllib2.urlopen(url).read(), re.I)[0]
    downData = {'ref': ref, 'reff':reff, 'submit':'download'}
   else:
    downsite = "http://www.downhh.com/download.php"
    downData = {'ref': ref, 'rulesubmit':'download'}
   #print "bt site - " +  downsite + "n downData:"
   #print downData
   downData = urllib.urlencode(downData)
   downReq = urllib2.Request(downsite, downData)
   downReq.add_header('User-Agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36')
   downPost = urllib2.urlopen(downReq)
   stream = downPost.read(-1)
   if (len(stream) > 1000):
    downPost.close()
    name = btStep1[0]+ ".torrent"
    fw = open(dirPath + name, 'wb')  # binary mode so the torrent data is written unmodified
    fw.write(stream)
    fw.close()
    logger(sucfile, url + "\n")
   else:
    logger(errorfile, url + "\n")
 except urllib2.URLError, e:
  print e.reason
def logger(logfile, msg):
 print msg
 fw = open(logfile, 'a')
 fw.write(msg)
 fw.close()
for i in range(1, 1000):
 logger(logfile, "nnn@ page " + str(i) + " ...")
 part = site + source + str(i)

 content = urllib2.urlopen(part).read()
 content = content.decode('gbk').encode('utf8')
 #print content
 pages = re.findall('<a\s+href="(htm_data/[\d]+/[\d]+/[\d]+\.html).*?</a>', content, re.I)
 #print pages
 for page in pages:
  page = site + page
  #logger(logfile, "\n# visiting " + page + " ...")
  pageCode = urllib2.urlopen(page).read()
  #print pageCode
  zzJump = re.findall('http://www\.viidii\.info/\?http://[\w]+/[\w]+\?[\w]{2,6}=[\w]+', pageCode)
  #zzJump = re.findall('http://www\.viidii\.info/\?http://[\w/?=]*', pageCode)
  if len(zzJump) > 0:
   zzJump = zzJump[0]
   #print "- jump page - " + zzJump
   pageCode = urllib2.urlopen(page).read()
   zzPage = re.findall('http://[\w]+\.[\w]+\.[\w]+/link[\w]?\.php\?[\w]{2,6}=[\w]+', pageCode)
   if len(zzPage) > 0:
    zzPage = zzPage[0]
    logger(logfile, "n- zhongzi page -" + zzPage)
    btDown(zzPage, btSave)
   else:
    logger(logfile, "n. NOT FOUND .")
  else:
   logger(logfile, "n... NOT FOUND ...")
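
For reference, the script above is written for Python 2 (urllib2, cookielib, print statements); urllib2 does not exist in Python 3. The sketch below shows roughly how the same requests map onto the Python 3 standard library. It is not a full port of the crawler, and the ref value 'abc123' is only a placeholder for a torrent id extracted by the regular expressions above.

import urllib.request
import urllib.parse
import http.cookiejar

# Cookie-aware opener, the Python 3 counterpart of urllib2 + cookielib
jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
urllib.request.install_opener(opener)

# Fetch a board listing page; the forum pages are GBK-encoded
listing = urllib.request.urlopen("http://xxx.yyy.zzz/thread0806.php?fid=x&page=1").read()
listing = listing.decode('gbk')

# POST the download form; urlopen() in Python 3 expects the body as bytes
data = urllib.parse.urlencode({'ref': 'abc123', 'submit': 'download'}).encode('ascii')
torrent = urllib.request.urlopen("http://www.downhh.com/download.php", data).read()
with open("./clyzwm/abc123.torrent", 'wb') as fw:  # binary mode for the torrent payload
    fw.write(torrent)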

