Python: using urllib2 to download torrent seed files by crawling a web forum
- 2020-04-02 13:25:43
- OfStack
Fetch the seeds using the urllib2 and re modules.
Approach
1. Log in the forum with the program (if you need to log in to access the section)
2. Access the specified section
3. Traverse posts (first take the specified page, then traverse the url of all posts on the page)
4. Cyclic access to all posts url, from the post page code to get the seed download address (through regular expressions or third-party page resolution library)
5. Visit the seeds page to download the seeds
import urllib
import urllib2
import cookielib
import re
import sys
import os
# site is the forum base address; source is the section listing page
# (fid selects the sub-forum; the crawl loop appends the page number).
site = "http://xxx.yyy.zzz/"
source = "thread0806.php?fid=x&search=&page="
# Local directory that receives the downloaded .torrent files and the logs.
btSave = "./clyzwm/"
if os.path.isdir(btSave):
    print(btSave + " existing")
else:
    os.mkdir(btSave)
logfile = "./clyzwm/down.log"
errorfile = "./clyzwm/error.log"
# NOTE(review): "sucess" is a typo but is kept as-is — it is the on-disk
# filename the rest of the script writes to.
sucfile = "./clyzwm/sucess.log"
# NOTE(review): this dict is never passed to urlopen; only the User-Agent
# is re-set per request inside btDown. Confirm whether the Referer was
# meant to be sent as well.
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',
           'Referer' : 'http://xxx.yyy.zzz/'}
def btDown(url, dirPath):
    """Resolve the torrent behind the seed page *url* and save it to *dirPath*.

    The seed page URL carries a reference id in its query string. Depending
    on the id's length the download is POSTed either to download.php on the
    same host (with an extra hidden "reff" form field scraped from the page)
    or to the fixed downhh.com endpoint. Responses shorter than ~1000 bytes
    are treated as error pages, not torrents.

    url     -- seed page URL of the form http://host/xxx.php?key=<ref>
    dirPath -- directory (with trailing slash) to write <ref>.torrent into
    """
    logger(logfile, "download file : " + url)
    try:
        # Pull the reference id out of the query string.
        btStep1 = re.findall(r'http://[\w]+\.[\w]+\.[\w]{0,4}/[\w]{2,6}\.php\?[\w]{2,6}=([\w]+)', url, re.I)
        if len(btStep1) > 0:
            ref = btStep1[0]
            downsite = ""
            downData = {}
            if len(ref) > 20:
                # Long refs: download endpoint lives on the same host and
                # also requires the hidden "reff" value from the seed page.
                downsite = re.findall(r'http://www\.[\w]+\.[\w]+/', url)[0]
                downsite = downsite + "download.php"
                reff = re.findall(r'input\stype="hidden"\sname="reff"\svalue="([\w=]+)"',
                                  urllib2.urlopen(url).read(), re.I)[0]
                downData = {'ref': ref, 'reff': reff, 'submit': 'download'}
            else:
                # Short refs are served by a fixed third-party endpoint.
                downsite = "http://www.downhh.com/download.php"
                downData = {'ref': ref, 'rulesubmit': 'download'}
            downData = urllib.urlencode(downData)
            downReq = urllib2.Request(downsite, downData)
            downReq.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36')
            downPost = urllib2.urlopen(downReq)
            stream = downPost.read(-1)
            # Close unconditionally (the original leaked the handle on the
            # error-page branch).
            downPost.close()
            # Heuristic: a real torrent is larger than a short error page.
            if len(stream) > 1000:
                name = btStep1[0] + ".torrent"
                # 'wb': torrent payloads are binary; text mode would corrupt
                # them on platforms that translate line endings.
                fw = open(dirPath + name, 'wb')
                fw.write(stream)
                fw.close()
                logger(sucfile, url + "\n")
            else:
                logger(errorfile, url + "\n")
    except urllib2.URLError as e:
        print(e.reason)
def logger(logfile, msg):
    """Echo *msg* to stdout and append it verbatim to *logfile*.

    No newline is added; callers embed their own "\n" in msg.
    """
    print(msg)
    # 'with' guarantees the handle is closed even if the write raises.
    with open(logfile, 'a') as fw:
        fw.write(msg)
# Crawl listing pages 1..999 of the section, open every post found on each
# page, locate the intermediate seed page and hand it to btDown().
for i in range(1, 1000):
    logger(logfile, "\n\n\n@ page " + str(i) + " ...")
    part = site + source + str(i)
    content = urllib2.urlopen(part).read()
    # The forum is served as GBK; transcode so the regexes see UTF-8 bytes.
    content = content.decode('gbk').encode('utf8')
    # Relative links to the posts on this listing page.
    pages = re.findall(r'<a\s+href="(htm_data/[\d]+/[\d]+/[\d]+\.html).*?</a>', content, re.I)
    for page in pages:
        page = site + page
        pageCode = urllib2.urlopen(page).read()
        # Posts hide the seed link behind a viidii.info redirect.
        zzJump = re.findall(r'http://www\.viidii\.info/\?http://[\w]+/[\w]+\?[\w]{2,6}=[\w]+', pageCode)
        if len(zzJump) > 0:
            zzJump = zzJump[0]
            pageCode = urllib2.urlopen(page).read()
            zzPage = re.findall(r'http://[\w]+\.[\w]+\.[\w]+/link[\w]?\.php\?[\w]{2,6}=[\w]+', pageCode)
            if len(zzPage) > 0:
                zzPage = zzPage[0]
                logger(logfile, "\n- zhongzi page -" + zzPage)
                btDown(zzPage, btSave)
            else:
                logger(logfile, "\n. NOT FOUND .")
        else:
            logger(logfile, "\n... NOT FOUND ...")
            # NOTE(review): the result below is never used in the original
            # script — presumably it was meant to feed btDown as a fallback
            # when no viidii redirect is present. Kept for parity; confirm
            # intent before wiring it up.
            zzPage = re.findall(r'http://[\w]+\.[\w]+\.[\w]+/link[\w]?\.php\?ref=[\w]+', pageCode)