How to crawl font files with a Python script
- 2020-05-30 20:30:47
- OfStack
preface
As anyone who has worked on captcha recognition knows, improving accuracy requires plenty of labeled test data first. Downloading captchas is easy, but labeling them by hand is overwhelming, so I settled on a compromise: generate my own captchas.
To ensure diversity, different fonts are needed first of all — any font file in a format like TTF will do, and many TTF fonts are available for download online. Of course I was not going to download and unzip them all by hand, so I decided to write a crawler.
Implementation method
Website 1: fontsquirrel.com
The site's fonts are free to download, but many of the download links point to other sites; those off-site links are skipped.
#coding:utf-8
# Python 2 script: crawl free font archives from fontsquirrel.com.
import urllib2,cookielib,sys,re,os,zipfile
import numpy as np
# Build a cookie-aware opener with a browser User-Agent and install it
# as the global default used by urllib2.urlopen() below.
cj=cookielib.CookieJar()
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders=[('User-agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36))')]
urllib2.install_opener(opener)
# Search for downloadable links
# Scrape one listing page for locally-hosted download links.
def search(path):
    """Fetch the page at *path* and append every on-site download URL
    found there to the module-level ``items`` list.

    Only anchors whose text contains 'Download ' and does not mention
    'offsite' are kept; duplicates are skipped.

    NOTE: relies on the global ``items`` list existing before the
    first call (it is defined further down in the script).
    """
    request=urllib2.Request(path)
    response=urllib2.urlopen(request)
    html=response.read()
    # Flatten to a single line so the regex can match across tags.
    html=html.replace('\n',' ')
    urls=re.findall(r'<a href="(.*?)" rel="external nofollow" >(.*?)</a>',html)
    for url,inner in urls:
        # Keep local "Download" links only, and avoid duplicates.
        if re.findall(r'Download ',inner) and not re.findall(r'offsite',inner) and url not in items:
            items.append(url)
items=[]  # download paths collected by search(), relative to the site root

# Walk the paginated font list: 15 pages of 50 entries each,
# filtered to fonts that can be downloaded locally.
for page in xrange(15):
    host='http://www.fontsquirrel.com/fonts/list/find_fonts/'+str(page*50)+'?filter%5Bdownload%5D=local'
    search(host)

# Download everything into a dedicated ./ttf directory.
if not os.path.exists('ttf'):
    os.mkdir('ttf')
os.chdir('ttf')
def unzip(rawfile,outputdir):
    """Extract every member of the zip archive *rawfile* into *outputdir*.

    Prints 'yes' plus each archived name on success; prints 'no' and
    does nothing when *rawfile* is not a valid zip (e.g. a failed or
    HTML-error download).
    """
    if zipfile.is_zipfile(rawfile):
        print('yes')
        fz=zipfile.ZipFile(rawfile,'r')
        try:
            for member in fz.namelist():
                print(member)  # show the archived entry being extracted
                fz.extract(member,outputdir)
        finally:
            fz.close()  # bug fix: the archive handle was never closed
    else:
        print('no')
# Fetch each archive, unpack it in place, then delete the zip.
for item in items:
    print(item)
    request=urllib2.Request('http://www.fontsquirrel.com'+item)
    response=urllib2.urlopen(request)
    data=response.read()
    name=item.split('/')[-1]+'.zip'
    # bug fix: 'wb' — the payload is binary; text mode corrupts zips
    # on platforms that translate line endings.
    f=open(name,'wb')
    f.write(data)
    f.close()  # must be closed before unzip() reads it back
    unzip(name,'./')
    os.remove(name)
os.listdir(os.getcwd())
os.chdir('../')

# Keep only .ttf/.otf files; drop everything else that was unpacked.
for entry in os.listdir('ttf/'):
    ext=entry.split('.')[-1]
    if ext!='ttf' and ext!='otf':
        full='ttf/'+entry
        # bug fix: test the path inside ttf/, not relative to the cwd
        if os.path.isdir(full):
            os.removedirs(full)  # NOTE: only removes empty directories
        else:
            os.remove(full)
print(len(os.listdir('ttf/')))
Got 2000+ fonts, quite a variety, pretty good.
Website 2: dafont.com
This site offers a larger variety of fonts and downloads are more convenient; the annoying part is that its file-name encoding seems to be broken.
#coding:utf-8
# Python 2 script: crawl fonts from dafont.com.
import urllib2,cookielib,sys,re,os,zipfile
import shutil
import numpy as np
# Cookie-aware opener with a browser User-Agent, installed as the
# global default for urllib2.urlopen().
cj=cookielib.CookieJar()
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders=[('User-agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36))')]
urllib2.install_opener(opener)
items=[]  # direct download URLs collected by search()
# Collect direct download links ("dl.dafont.com/dl/?f=...") from one page.
def search(path):
    """Fetch the page at *path* and extend the global ``items`` list
    with every direct download URL found on it (duplicates are removed
    later with set())."""
    request=urllib2.Request(path)
    response=urllib2.urlopen(request)
    html=response.read()
    # Flatten so the regex below can match across original line breaks.
    html=html.replace('\n',' ')
    urls=re.findall(r'href=\"(http://dl.dafont.com/dl/\?f=.*?)\" >',html)
    items.extend(urls)
# Crawl all 117 "new fonts" pages, then de-duplicate the links.
for page in xrange(117):
    host='http://www.dafont.com/new.php?page='+str(page+1)
    search(host)
    print('Page'+str(page+1)+'done')
items=list(set(items))
print(len(items))

# Download everything into a dedicated ./ttf2 directory.
if not os.path.exists('ttf2'):
    os.mkdir('ttf2')
os.chdir('ttf2')
def unzip(rawfile,outputdir):
    """Extract every member of the zip archive *rawfile* into *outputdir*.

    Prints 'yes' plus each archived name on success; prints 'no' and
    does nothing when *rawfile* is not a valid zip archive.
    """
    if zipfile.is_zipfile(rawfile):
        print('yes')
        fz=zipfile.ZipFile(rawfile,'r')
        try:
            for member in fz.namelist():
                print(member)  # show the archived entry being extracted
                fz.extract(member,outputdir)
        finally:
            fz.close()  # bug fix: the archive handle was never closed
    else:
        print('no')
# Download, unpack and delete each archive.
for item in items:
    print(item)
    request=urllib2.Request(item)
    response=urllib2.urlopen(request)
    data=response.read()
    name=item.split('=')[-1]+'.zip'
    # bug fix: 'wb' — the zip payload is binary, text mode corrupts it.
    f=open(name,'wb')
    f.write(data)
    f.close()  # close before unzip() reopens the file
    unzip(name,'./')
    os.remove(name)
print(os.listdir(os.getcwd()))

# Recursively prune everything that is not a .ttf/.otf font file.
for root,dirs,names in os.walk('./'):
    for fname in names:
        ext=fname.split('.')[-1]
        if ext!='ttf' and ext!='otf':
            # bug fix: os.path.join — 'root+fname' dropped the path
            # separator for files inside sub-directories.
            os.remove(os.path.join(root,fname))
            print(fname)
# Remove the (now ideally empty) sub-directories left by extraction.
for entry in os.listdir('./'):
    if os.path.isdir(entry):
        os.rmdir(entry)  # NOTE: fails on non-empty dirs, as the original did
os.chdir('../')
The overall process is similar to the previous one; after running for a few dozen minutes it fetched more than 4,000 fonts.
conclusion