Three ways to download images in bulk in Python

  • 2020-04-02 13:10:59
  • OfStack

There are three ways: the first is to use win32com, the extension library provided by Microsoft, to drive IE; the second is to use selenium's webdriver; and the third is to parse the page with HTMLParser, which ships with Python. Win32com gives you a document object similar to the one available in JavaScript, but it appears to be read-only (I could not find documentation for it). Selenium supports Chrome, IE, Firefox and other browsers; each browser driver offers execute_script and find_element_by_xx methods, which make it easy to run JavaScript (including modifying elements) and to read elements of the HTML page. The downside is that, at the time of writing, selenium only supports Python 2.6 and 2.7. With HTMLParser you have to write your own class that inherits from the base class and overrides the methods that handle the parsed elements. Personally, I found selenium the most convenient to use, and it makes it easy to manipulate elements in the HTML.
The code is as follows:

Win32com:


# Scroll the page down towards the bottom, at most about 20000 pixels
# Simulate the right-arrow key to move on to the next picture
import sys
import win32com.client,win32api
import urllib.request
import time
import os
def _image_file_name(src):
    """Turn an image URL into a flat file name.

    Every '/' and ':' becomes '_'.  The name is then cut just before the
    last '!' it contains; if it also contains a '?', the cut is made just
    before the last '?' instead.
    """
    file_name = src.replace('/', '_').replace(':', '_')
    end = len(file_name)
    if '!' in file_name:
        end = file_name.rfind('!')
    if '?' in file_name:
        end = file_name.rfind('?')
    return file_name[:end]


def main():
    """Step through a picture page in Internet Explorer and save its images.

    The start address is read from ``sys.argv[1]``.  For every page the
    function waits until IE reports the document as loaded, scrolls down in
    500-pixel steps (at most 40 of them), and downloads each image whose
    element id is ``b_img``.  Downloads larger than 7000 bytes are written
    to ``E:\\img\\<page title>``; the directory is created on the first
    page.  A right-arrow key press is then sent and the loop repeats until
    the address of the document no longer changes.

    Raises:
        IndexError: if no URL was given on the command line.
        urllib.error.URLError: if an image cannot be downloaded.
        OSError: if the directory or an image file cannot be written.
    """
    # Characters Windows does not accept in a file or directory name.
    invalid_name_chars = '\\/:*?"<>|'

    url = sys.argv[1]
    # Drive Internet Explorer through COM automation.
    ie = win32com.client.Dispatch("InternetExplorer.Application")
    ie.Navigate(url)
    ie.Visible = True
    last_url = ''
    dir_name = ''
    while last_url != url:
        print('\nThe URL is:', url, '\n')
        # 4 is READYSTATE_COMPLETE: wait for the browser, then the document.
        while ie.ReadyState != 4:
            time.sleep(1)
        while ie.Document.readyState != "complete":
            time.sleep(1)
        # Scroll down step by step; stop as soon as the offset stops changing.
        win = ie.Document.parentWindow
        last_y = -1
        for step in range(40):
            win.scrollTo(0, 500 * step)
            now_y = win.pageYOffset
            if now_y == last_y:
                break
            last_y = now_y
            time.sleep(0.4)
        print('Document load state:', ie.Document.readyState)
        doc = ie.Document
        # The target directory is named after the first page's title.
        if not dir_name:
            title = doc.title
            for char in invalid_name_chars:
                title = title.replace(char, '-')
            dir_name = os.path.join('E:\\img', title)
            os.makedirs(dir_name, exist_ok=True)
        all_image = doc.images
        print(' A total of ', all_image.length, ' image ')
        count = 0
        for img in all_image:
            if img.id != 'b_img':
                continue
            count += 1
            print(count, img.src)
            time.sleep(1)
            with urllib.request.urlopen(img.src) as response:
                data = response.read()
            print(count, 'donwload complete!', '-' * 10, 'size:',
                  '{:.1f}'.format(len(data) / 1024), 'KB')
            # Small payloads are skipped rather than saved.
            if len(data) > 7000:
                file_name = _image_file_name(img.src)
                with open(os.path.join(dir_name, file_name), 'wb') as write_file:
                    write_file.write(data)
                print(count, file_name, 'complete!')
        # Move on to the next picture: 39 is the virtual-key code of the
        # right-arrow key.
        # NOTE(review): only a key-down event is sent, no matching key-up
        # follows -- confirm this is intended.
        last_url = url
        win32api.keybd_event(39, 0)
        time.sleep(1)
        url = ie.Document.url
        print(last_url, url)
    # The browser window is left open; call ie.Quit() here to close it.


if __name__ == '__main__':
    main()

Selenium:


# -*- coding: cp936 -*-
import sys
import urllib
import time
import os
from selenium import webdriver
def _image_file_name(src):
    """Turn an image URL into a flat file name.

    Every '/' and ':' becomes '_'.  The name is then cut just before the
    last '!' it contains; if it also contains a '?', the cut is made just
    before the last '?' instead.
    """
    file_name = src.replace('/', '_').replace(':', '_')
    end = len(file_name)
    if '!' in file_name:
        end = file_name.rfind('!')
    if '?' in file_name:
        end = file_name.rfind('?')
    return file_name[:end]


def main():
    """Open a page in Chrome through Selenium and save its images.

    The address is read from ``sys.argv[1]``.  The page is scrolled to the
    bottom once, every ``<img>`` element with a ``src`` is downloaded, and
    downloads larger than 7000 bytes are written to
    ``E:\\img\\<page title>``.  The browser is closed when the function
    finishes, including when a download fails.

    The body is written so that it runs on both Python 2 and Python 3.

    Raises:
        IndexError: if no URL was given on the command line.
        IOError / OSError: if a download or a file write fails.
    """
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # Characters Windows does not accept in a file or directory name.
    invalid_name_chars = '\\/:*?"<>|'

    url = sys.argv[1]
    # Drive Chrome through its WebDriver.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # driver.title is used because the .text of the <title> element is
        # empty: the element is not rendered on the page.
        title = driver.title
        print(title)
        for char in invalid_name_chars:
            title = title.replace(char, '-')
        root_dir = 'E:\\img'
        dir_name = root_dir + '\\' + title
        if not os.path.exists(root_dir):
            os.mkdir(root_dir)
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        # Generic locator call; 'tag name' is the value of By.TAG_NAME.
        images = driver.find_elements('tag name', 'img')
        count = 0
        for image in images:
            count += 1
            image_url = image.get_attribute('src')
            if not image_url:
                # An <img> without a src has nothing to download.
                continue
            response = urlopen(image_url)
            try:
                data = response.read()
            finally:
                response.close()
            print('%d donwload complete! %s size: %d KB'
                  % (count, '-' * 10, len(data) // 1024))
            # Small payloads are skipped rather than saved.
            if len(data) > 7000:
                file_name = _image_file_name(image_url)
                with open(dir_name + '\\' + file_name, 'wb') as write_file:
                    write_file.write(data)
                print('%d %s complete!' % (count, file_name))
    finally:
        driver.quit()


if __name__ == '__main__':
    main()

HTMLParser:


# import modules used here -- sys is a very standard one
import sys
import urllib.request
# Gather our code in a main() function
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """HTML parser that downloads the image behind every ``<img src=...>``.

    Feed it a page with ``feed()``; each image start tag triggers a download
    of its ``src`` URL.  Downloads larger than ``min_size`` bytes are written
    into ``save_dir`` under a file name derived from the URL.

    Download and file errors are not caught: they propagate out of
    ``feed()`` and stop the parse.
    """

    # Directory the images are written to; it must already exist.
    save_dir = 'E:\\img'
    # Downloads of this many bytes or fewer are discarded.
    min_size = 1000

    def handle_starttag(self, tag, attrs):
        """Download the image referenced by an ``<img>`` start tag.

        Args:
            tag: lower-cased tag name supplied by the parser.
            attrs: list of ``(name, value)`` pairs; ``value`` is None for an
                attribute written without a value.
        """
        if tag != 'img':
            return
        for name, value in attrs:
            # Skip a bare or empty src: there is nothing to fetch.
            if name == 'src' and value:
                self._save_image(value)

    def _save_image(self, src):
        """Fetch ``src`` and store it in ``save_dir`` if it is large enough."""
        # Local import: the import header of this script does not load os.
        import os.path

        with urllib.request.urlopen(src) as response:
            data = response.read()
        if len(data) <= self.min_size:
            return
        # Flatten the URL into a single path component.  Backslashes are
        # replaced too so the URL cannot introduce extra path separators.
        file_name = src.replace('/', '_').replace(':', '_').replace('\\', '_')
        # Cut before the last '!'; a '?' takes precedence when present.
        end = len(file_name)
        if '!' in file_name:
            end = file_name.rfind('!')
        if '?' in file_name:
            end = file_name.rfind('?')
        file_name = file_name[:end]
        with open(os.path.join(self.save_dir, file_name), 'wb') as write_file:
            write_file.write(data)
def main():
    """Download a web page, save a copy of it, and fetch its images.

    The address is read from ``sys.argv[1]``.  The raw page is written to
    ``E:\\img\\html\\result.htm``, then decoded and fed to ``MyHTMLParser``,
    which downloads the images it finds.

    Raises:
        IndexError: if no URL was given on the command line.
        urllib.error.URLError: if the page cannot be downloaded.
        OSError: if the page copy cannot be written.
    """
    # Local import: the import header of this script does not load os.
    import os

    url = sys.argv[1]
    print('\nThe URL is:', url, '\n')
    # Read the resource the URL points to.
    with urllib.request.urlopen(url) as response:
        byte_content = response.read()
        # Prefer the charset announced by the server over a fixed guess.
        charset = response.headers.get_content_charset() or 'utf-8'
    # Save the page.  Creating this directory also creates E:\img, where
    # the parser stores the images.
    html_dir = 'E:\\img\\html'
    os.makedirs(html_dir, exist_ok=True)
    with open(os.path.join(html_dir, 'result.htm'), 'wb') as url_file:
        url_file.write(byte_content)
    # Convert from bytes to str; undecodable bytes are replaced so that one
    # bad character does not abort the whole run.
    try:
        s = byte_content.decode(charset, errors='replace')
    except LookupError:
        # The announced charset is unknown to Python.
        s = byte_content.decode('utf-8', errors='replace')
    # HTMLParser no longer accepts the ``strict`` argument (removed in 3.5).
    parser = MyHTMLParser()
    parser.feed(s)
    # Process anything still buffered.
    parser.close()


# Standard boilerplate to call the main() function to begin
# the program.
if __name__ == '__main__':
    main()


Related articles: