Three ways to download images in bulk in Python
- 2020-04-02 13:10:59
- OfStack
There are three approaches: the first uses win32com, Microsoft's extension library, to drive IE; the second uses selenium's webdriver; the third parses the page with HTMLParser, which ships with Python. Win32com exposes a document object similar to the one in JavaScript, but it appears to be read-only (no way to modify the document was found). Selenium supports Chrome, IE, Firefox, and others; each browser driver provides execute_script and find_element_by_* methods, making it easy to run JavaScript (including modifying elements) and to read elements from the HTML. The downside is that, at the time of writing, selenium only supported Python 2.6 and 2.7. With HTMLParser you write your own class that inherits from the base class and overrides the element-parsing methods. Personally, I found selenium the most convenient for manipulating elements in HTML.
The code is as follows:
Win32com:
# Scroll the page to the bottom (sliding at most ~20000 pixels)
# Simulate a right-arrow keypress to advance to the next picture
import sys
import win32com.client,win32api
import urllib.request
import time
import os
def main():
    """Drive Internet Explorer via COM and bulk-download images.

    Usage: script.py <start_url>

    Opens the URL in IE, scrolls down so lazily-loaded images appear,
    saves every image whose id is 'b_img' into E:\\img\\<page title>,
    then simulates a right-arrow keypress to advance to the next page.
    Stops when the URL no longer changes after the keypress.
    """
    url = sys.argv[1]
    # Launch Internet Explorer through COM automation.
    ie = win32com.client.Dispatch("InternetExplorer.Application")
    ie.Navigate(url)
    ie.Visible = True
    last_url = ''
    dir_name = ''
    while last_url != url:
        print('\nThe URL is:', url, '\n')
        # Wait until both the browser and the document report ready.
        while ie.ReadyState != 4:
            time.sleep(1)
        while ie.Document.readyState != "complete":
            time.sleep(1)
        # Scroll down in 500px steps (at most 40 * 500 = 20000px) so
        # lazy-loaded images are fetched; stop once the page stops moving.
        win = ie.Document.parentWindow
        last_y = -1
        for i in range(40):
            win.scrollTo(0, 500 * i)
            now_y = win.pageYOffset
            if now_y == last_y:
                break
            last_y = now_y
            time.sleep(0.4)
        print('Document load state:', ie.Document.readyState)
        doc = ie.Document
        # Create the output directory on the first iteration, named after
        # the page title ('|' is illegal in Windows paths).
        if dir_name == '':
            root_dir = r'E:\img'
            dir_name = root_dir + '\\' + doc.title
            dir_name = dir_name.replace('|', '-')
            if not os.path.exists(root_dir):
                os.mkdir(root_dir)
            if not os.path.exists(dir_name):
                os.mkdir(dir_name)
        all_image = doc.images
        print(' A total of ', all_image.length, ' image ')
        count = 0
        for img in all_image:
            if img.id == 'b_img':
                count += 1
                print(count, img.src)
                time.sleep(1)
                img_file = urllib.request.urlopen(img.src)
                try:
                    byte = img_file.read()
                finally:
                    img_file.close()
                print(count, 'download complete!', '-' * 10,
                      'size:', '{:.3}'.format(len(byte) / 1024), 'KB')
                # Skip tiny responses (icons / placeholders) under ~7 KB.
                if len(byte) > 7000:
                    # Build a filesystem-safe name from the URL and cut off
                    # any '!'/'?' suffix (size hints or query strings).
                    file_name = img.src.replace('/', '_').replace(':', '_')
                    end = len(file_name)
                    if file_name.rfind('!') != -1:
                        end = file_name.rfind('!')
                    if file_name.rfind('?') != -1:
                        end = file_name.rfind('?')
                    file_name = file_name[:end]
                    with open(dir_name + '\\' + file_name, 'wb') as write_file:
                        write_file.write(byte)
                    print(count, file_name, 'complete!')
        # Advance to the next picture: 39 is VK_RIGHT (right-arrow key).
        last_url = url
        win32api.keybd_event(39, 0)
        time.sleep(1)
        url = ie.Document.url
        print(last_url, url)
    # ie.Quit()


if __name__ == '__main__':
    main()
Selenium:
# -*- coding: cp936 -*-
import sys
import urllib
import time
import os
from selenium import webdriver
def main():
# To obtain parameters
url=sys.argv[1]
# operation IE
driver=webdriver.Chrome()
driver.get(url)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Create a directory
dir_name=driver.find_element_by_tag_name('title').text
print dir_name
root_dir='E:\img'
dir_name=root_dir+'\'+dir_name
dir_name=dir_name.replace('|','-')
if(os.path.exists(root_dir)!=True):
os.mkdir(root_dir)
if(os.path.exists(dir_name)!=True):
os.mkdir(dir_name)
images=driver.find_elements_by_tag_name('img')
count=0
for image in images:
count=count+1
image_url=str(image.get_attribute('src'))
img_file=urllib.urlopen(image_url)
byte=img_file.read()
print count,'donwload complete!','-'*10,'size:',byte.__len__()/1024,'KB'
if(byte.__len__()>7000):
file_name=image_url.replace('/','_')
file_name=file_name.replace(':','_')
end=file_name.__len__()
if(file_name.rfind('!')!=-1):
end=file_name.rfind('!')
if(file_name.rfind('?')!=-1):
end=file_name.rfind('?')
file_name=file_name[:end]
write_file=open(dir_name+'\'+file_name,'wb')
write_file.write(byte)
write_file.close()
print count,file_name,'complete!'
driver.quit()
if __name__ == '__main__':
main()
HTMLParser:
# import modules used here -- sys is a very standard one
import sys
import urllib.request
# Gather our code in a main() function
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """HTML parser that downloads every <img> it encounters.

    For each <img src=...> start tag, the image is fetched and — if the
    response is larger than 1000 bytes (filters out icons/placeholders)
    — saved under E:\\img with a filename derived from its URL.
    """

    def handle_starttag(self, tag, attrs):
        if tag != 'img':
            return
        for name, value in attrs:
            if name != 'src':
                continue
            img_file = urllib.request.urlopen(value)
            try:
                byte = img_file.read()
            finally:
                img_file.close()
            # Only keep real images; tiny responses are placeholders.
            if len(byte) > 1000:
                # Build a filesystem-safe name from the URL and cut off
                # any '!'/'?' suffix (size hints or query strings).
                file_name = value.replace('/', '_').replace(':', '_')
                end = len(file_name)
                if file_name.rfind('!') != -1:
                    end = file_name.rfind('!')
                if file_name.rfind('?') != -1:
                    end = file_name.rfind('?')
                file_name = file_name[:end]
                with open('E:\\img\\' + file_name, 'wb') as write_file:
                    write_file.write(byte)
def main():
    """Fetch the page at sys.argv[1], save the raw HTML to
    E:\\img\\html\\result.htm, then feed it to MyHTMLParser, which
    downloads the images it finds.
    """
    url = sys.argv[1]
    print('\nThe URL is:', url, '\n')
    # Read the resource the URL points to.
    html_file = urllib.request.urlopen(url)
    try:
        byte_content = html_file.read()
    finally:
        html_file.close()
    # Keep a copy of the raw page.  Raw string: in the original the
    # '\r' in this path was an accidental carriage-return escape.
    with open(r'E:\img\html\result.htm', 'wb') as url_file:
        url_file.write(byte_content)
    # Decode from bytes to str before parsing.
    s = str(byte_content, encoding="utf-8")
    # 'strict' was removed from HTMLParser in Python 3.5; the parser is
    # always tolerant now, so construct without it.
    parser = MyHTMLParser()
    parser.feed(s)


# Standard boilerplate to call the main() function to begin
# the program.
if __name__ == '__main__':
    main()