Python crawler: using Selenium + OpenCV to solve the slider CAPTCHA and simulate logging in to Zhihu

2021-11-30 00:57:35
OfStack

Sliding verification distance

Download two images: the CAPTCHA background image and the slider image. Preprocess both with the OpenCV library (Gaussian blur followed by Canny edge detection), then match the slider against the background with the matchTemplate method to obtain the sliding distance. Note that for the Zhihu CAPTCHA, the slider actually has to be moved 10px further to the right than the matched position.


def get_distance(self, bg_img_path='./bg.png', slider_img_path='./slider.png'):
    """Get the slider movement distance.

    Both images are read as grayscale, denoised with a 3x3 Gaussian blur and
    reduced to Canny edge maps; the slider edge map is then template-matched
    against the background edge map.

    Args:
        bg_img_path: Path of the CAPTCHA background image.
        slider_img_path: Path of the slider (jigsaw piece) image.

    Returns:
        The x coordinate (abscissa) of the top-left corner of the best match.

    Raises:
        OSError: If either image cannot be read by OpenCV.
    """
    edge_maps = []
    for path in (bg_img_path, slider_img_path):
        gray = cv.imread(path, 0)  # flag 0: read as a grayscale picture
        if gray is None:
            # imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than deep inside OpenCV.
            raise OSError('cannot read image file: %r' % (path,))
        blurred = cv.GaussianBlur(gray, (3, 3), 0)  # Gaussian blur denoising
        edge_maps.append(cv.Canny(blurred, 50, 150))  # Canny edge detection
    bg_edges, slider_edges = edge_maps

    # Find the best match
    res = cv.matchTemplate(bg_edges, slider_edges, cv.TM_CCOEFF_NORMED)
    # minMaxLoc returns (min value, max value, min location, max location),
    # for example: (-0.0577, 0.3097, (0, 0), (196, 1)).
    # For TM_CCOEFF_NORMED the best match is at the maximum.
    _, _, _, max_loc = cv.minMaxLoc(res)
    return max_loc[0]  # abscissa of the best match

Motion trajectory of slider

Simulate human behavior: instead of stopping right at the notch, slide a short distance past it (overshoot) and then move back toward the target position.


def get_tracks(self, distance):
    """Build a human-like sliding trajectory whose offsets sum to *distance*.

    The slider accelerates over the first 5/8 of the way, decelerates over
    the rest, deliberately overshoots the target by 10px and then backs off
    in small random steps until it is exactly on the target.

    Args:
        distance: Horizontal distance to move, in pixels.

    Returns:
        A list of integer x offsets, one per mouse move. Empty when
        *distance* is not positive.
    """
    distance = int(round(distance))
    if distance <= 0:
        return []  # nothing to drag

    overshoot = 10  # move this many px too far, then back off
    target = distance + overshoot
    t = 0.2  # unit time per step
    v = 0.0  # current velocity
    current = 0  # current displacement of the slider
    tracks = []

    while current < target:
        accelerating = current < target * 5 / 8
        a = random.randint(1, 3) if accelerating else -random.randint(2, 4)
        step = round(v * t + 0.5 * a * t ** 2)  # displacement in one unit time
        if not accelerating:
            # Deceleration may exhaust the velocity before the target is
            # reached; keep creeping forward instead of stalling or reversing.
            step = max(step, 1)
        step = min(step, target - current)  # land exactly on the overshoot point
        tracks.append(step)
        current += step
        v = max(v + a * t, 0.0)

    # Back off by exactly the overshoot, 1-3px at a time
    remaining = overshoot
    while remaining > 0:
        back = min(random.randint(1, 3), remaining)
        tracks.append(-back)
        remaining -= back
    return tracks

Mouse sliding operation

Drag the slider along the generated track using Selenium's mouse action chain (ActionChains)


def mouse_move(self, slide, tracks):
    """Drag the slider element along the given track.

    Args:
        slide: The slider element to press and hold.
        tracks: Iterable of horizontal offsets, applied one mouse move each.
    """
    browser = self.driver

    # Press and hold the mouse button on the slider
    ActionChains(browser).click_and_hold(slide).perform()

    # Replay the track; every offset is performed as its own action
    for dx in tracks:
        step = ActionChains(browser)
        step.move_by_offset(dx, 0)
        step.perform()

    # Let go of the mouse button on the slider
    ActionChains(browser).release(slide).perform()

Avoid Zhihu selenium detection

When crawling Zhihu with Selenium, the following error appears: "Error code 10001: Exception request please upgrade the client and try again". It occurs because Zhihu can detect that the browser is being driven by a Selenium automation script.

Use the remote debugging mode of chrome combined with selenium to remotely operate chrome for crawling, which will avoid selenium being detected by the website

Add environment variables

Add the directory containing chrome.exe to the system PATH environment variable, for example C:\Program Files\Google\Chrome\Application, so that the browser can be launched directly by typing chrome.exe on the command line

Open the cmd window and execute the command


chrome.exe --remote-debugging-port=9222 --user-data-dir="E:\eliwang\selenium_data"

Make sure the port is not already in use. The user-data-dir option specifies the directory where the browser profile is stored; any custom path can be used

The browser opens and a new tab opens

Main code for selenium takeover


# Attach Selenium to the Chrome instance listening on the remote-debugging address
options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

Close the browser window

1. Call the close() method of the browser object; note that the quit() method does not work here.

2. Open and close manually

Complete login code


# coding:utf-8

import cv2 as cv
import time
import random
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as WAIT
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from urllib.request import urlretrieve


class Zhihu_login:
    """Simulated Zhihu login through a Chrome instance attached via its debugger address.

    The class opens the Zhihu home page, logs in with a user name and
    password if no session is active, solves the slider CAPTCHA with
    OpenCV template matching and prints the resulting cookies.
    """

    MAX_ATTEMPTS = 5  # slider verification is tried at most this many times

    def __init__(self, username=' Account number ', password=' Password '):
        """Attach to the running Chrome and store the login settings.

        Args:
            username: Text typed into the ``username`` form field.
            password: Text typed into the ``password`` form field.
        """
        options = webdriver.ChromeOptions()
        # Take over the Chrome instance listening on this debugger address
        options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        self.driver = webdriver.Chrome(options=options)
        self.wait = WAIT(self.driver, 5)
        self.url = 'https://www.zhihu.com/'
        self.bg_img_path = './bg.png'
        self.slider_img_path = './slider.png'
        self.username = username
        self.password = password

    def run(self):
        """Execution entry: log in (if necessary), print the cookies, close the window.

        The browser window is closed on every exit path, including when an
        unexpected exception propagates.
        """
        try:
            self.driver.get(self.url)
            if self._is_logged_in(timeout=3):
                print(' Successful login ')
                self.save_cookie()
                return

            self._submit_credentials()
            time.sleep(1)
            # Do the sliding verification, retrying up to MAX_ATTEMPTS times in total
            for attempt in range(1, self.MAX_ATTEMPTS + 1):
                if attempt > 1:
                    print(' Trying to %d Second landing ' % attempt)
                if self.slide_verify():
                    if attempt == 1:
                        print(' Successful login ')
                    else:
                        print(' No. 1 %d Successful landing ' % attempt)
                    self.save_cookie()
                    return
                print(' No. 1 %d Log-in failure ' % attempt)
            print(' Log in failed 5 Time, stop logging in ')
        finally:
            self.driver.close()

    def _is_logged_in(self, timeout):
        """Return True if the logged-in marker element shows up within *timeout* seconds."""
        from selenium.common.exceptions import TimeoutException

        try:
            WAIT(self.driver, timeout).until(
                EC.presence_of_element_located((By.ID, 'Popover15-toggle')))
        except TimeoutException:
            return False
        return True

    def _submit_credentials(self):
        """Switch to password login, fill in the credentials and submit the form."""
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//div[contains(@class,"SignFlow-tabs")]/div[2]'))).click()
        for field_name, value in (('username', self.username),
                                  ('password', self.password)):
            # find_element(By.NAME, ...) replaces the removed find_element_by_name
            field = self.driver.find_element(By.NAME, field_name)
            field.clear()
            field.send_keys(value)
        # Click the login button
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//button[@type="submit"]'))).click()

    def slide_verify(self):
        """Perform one sliding verification.

        Downloads the CAPTCHA background and slider images, computes the
        distance, drags the slider and then waits for the logged-in marker.

        Returns:
            True if the logged-in marker element appears afterwards, else False.
        """
        slider_button = self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//div[@class="yidun_slider"]')))
        # URL of the CAPTCHA background picture
        self.bg_img_url = self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//img[@class="yidun_bg-img"]'))).get_attribute('src')
        # URL of the CAPTCHA slider picture
        self.slider_img_url = self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//img[@class="yidun_jigsaw"]'))).get_attribute('src')
        urlretrieve(self.bg_img_url, self.bg_img_path)
        urlretrieve(self.slider_img_url, self.slider_img_path)

        distance = self.get_distance(self.bg_img_path, self.slider_img_path)
        distance += 10  # the actual moving distance needs to be shifted right by 10px
        tracks = self.get_tracks(distance)
        self.mouse_move(slider_button, tracks)
        return self._is_logged_in(timeout=5)

    def save_cookie(self):
        """Print the browser cookies as a name -> value dict and return that dict."""
        cookie = {item['name']: item['value'] for item in self.driver.get_cookies()}
        print(cookie)
        print(' After successfully landing in Zhihu, cookie Information ')
        return cookie

    def mouse_move(self, slide, tracks):
        """Drag the slider element along the given track.

        Args:
            slide: The slider element to press and hold.
            tracks: Iterable of horizontal offsets, applied one mouse move each.
        """
        # Press and hold the mouse button on the slider
        ActionChains(self.driver).click_and_hold(slide).perform()
        # Slide according to the trajectory, one action per offset
        for track in tracks:
            ActionChains(self.driver).move_by_offset(track, 0).perform()
        ActionChains(self.driver).release(slide).perform()

    def get_distance(self, bg_img_path='./bg.png', slider_img_path='./slider.png'):
        """Get the slider movement distance.

        Both images are read as grayscale, denoised with a 3x3 Gaussian blur
        and reduced to Canny edge maps; the slider edge map is then
        template-matched against the background edge map.

        Args:
            bg_img_path: Path of the CAPTCHA background image.
            slider_img_path: Path of the slider (jigsaw piece) image.

        Returns:
            The x coordinate (abscissa) of the top-left corner of the best match.

        Raises:
            OSError: If either image cannot be read by OpenCV.
        """
        edge_maps = []
        for path in (bg_img_path, slider_img_path):
            gray = cv.imread(path, 0)  # flag 0: read as a grayscale picture
            if gray is None:
                # imread signals failure by returning None instead of raising;
                # fail here with the offending path rather than deep inside OpenCV.
                raise OSError('cannot read image file: %r' % (path,))
            blurred = cv.GaussianBlur(gray, (3, 3), 0)  # Gaussian blur denoising
            edge_maps.append(cv.Canny(blurred, 50, 150))  # Canny edge detection
        bg_edges, slider_edges = edge_maps

        # Find the best match
        res = cv.matchTemplate(bg_edges, slider_edges, cv.TM_CCOEFF_NORMED)
        # minMaxLoc returns (min value, max value, min location, max location),
        # for example: (-0.0577, 0.3097, (0, 0), (196, 1)).
        # For TM_CCOEFF_NORMED the best match is at the maximum.
        _, _, _, max_loc = cv.minMaxLoc(res)
        return max_loc[0]  # abscissa of the best match

    def get_tracks(self, distance):
        """Build a human-like sliding trajectory whose offsets sum to *distance*.

        The slider accelerates over the first 5/8 of the way, decelerates
        over the rest, deliberately overshoots the target by 10px and then
        backs off in small random steps until it is exactly on the target.

        Args:
            distance: Horizontal distance to move, in pixels.

        Returns:
            A list of integer x offsets, one per mouse move. Empty when
            *distance* is not positive.
        """
        distance = int(round(distance))
        if distance <= 0:
            return []  # nothing to drag

        overshoot = 10  # move this many px too far, then back off
        target = distance + overshoot
        t = 0.2  # unit time per step
        v = 0.0  # current velocity
        current = 0  # current displacement of the slider
        tracks = []

        while current < target:
            accelerating = current < target * 5 / 8
            a = random.randint(1, 3) if accelerating else -random.randint(2, 4)
            step = round(v * t + 0.5 * a * t ** 2)  # displacement in one unit time
            if not accelerating:
                # Deceleration may exhaust the velocity before the target is
                # reached; keep creeping forward instead of stalling or reversing.
                step = max(step, 1)
            step = min(step, target - current)  # land exactly on the overshoot point
            tracks.append(step)
            current += step
            v = max(v + a * t, 0.0)

        # Back off by exactly the overshoot, 1-3px at a time
        remaining = overshoot
        while remaining > 0:
            back = min(random.randint(1, 3), remaining)
            tracks.append(-back)
            remaining -= back
        return tracks


if __name__ == '__main__':
    # Script entry point: build the login helper and run the whole flow once
    login = Zhihu_login()
    login.run()