Python crawler: using Selenium + OpenCV to solve the slider CAPTCHA and simulate logging in to Zhihu
- 2021-11-30 00:57:35
- OfStack
Computing the slide distance
Download the CAPTCHA background image and the slider image, process both with the OpenCV library (Gaussian blur followed by Canny edge detection), and then match the two processed images with the matchTemplate method to obtain the slide distance. Note that for the Zhihu CAPTCHA the slider actually has to be moved 10px further to the right than the matched position.
def get_distance(self, bg_img_path='./bg.png', slider_img_path='./slider.png'):
    """Return the x coordinate at which the slider best matches the background.

    Both images are read as grayscale, smoothed with a 3x3 Gaussian blur and
    reduced to Canny edge maps. The slider edge map is then searched for in
    the background edge map with ``cv.matchTemplate`` (``cv.TM_CCOEFF_NORMED``).

    Args:
        bg_img_path: Path of the CAPTCHA background image.
        slider_img_path: Path of the slider image.

    Returns:
        The x coordinate (first component) of the best-match location.

    Raises:
        OSError: If either image cannot be read.
    """
    def load_edges(path):
        # Read one image as grayscale and turn it into a denoised edge map.
        gray = cv.imread(path, 0)
        if gray is None:
            # cv.imread signals failure by returning None instead of raising;
            # stop here with a clear message rather than failing inside
            # GaussianBlur with an opaque OpenCV error.
            raise OSError('Cannot read image: %s' % path)
        blurred = cv.GaussianBlur(gray, (3, 3), 0)
        return cv.Canny(blurred, 50, 150)

    bg_edges = load_edges(bg_img_path)
    slider_edges = load_edges(slider_img_path)
    res = cv.matchTemplate(bg_edges, slider_edges, cv.TM_CCOEFF_NORMED)
    # minMaxLoc returns (min_val, max_val, min_loc, max_loc), for example
    # (-0.0577, 0.3097, (0, 0), (196, 1)); the best match is at max_loc.
    _, _, _, max_loc = cv.minMaxLoc(res)
    return max_loc[0]
Slider motion trajectory
Simulate human behavior: when the slider reaches the notch, keep sliding a short distance past it, and then back up to the exact position.
def get_tracks(self, distance):
    """Build a human-like list of per-step x offsets that sum to ``distance``.

    The slider accelerates over the first 5/8 of the way and brakes
    afterwards. It deliberately runs at least 10px past the target, then
    backs up in small random steps until it sits exactly on the target.

    Args:
        distance: Net number of pixels the slider has to end up moved.

    Returns:
        A list of integer offsets; forward steps first, followed by the
        negative back-off steps.
    """
    overshoot = 10  # run this far past the target before backing up
    target = distance + overshoot
    tracks = []
    v = 0  # current velocity
    t = 0.2  # duration of one step
    current = 0  # displacement so far
    while current < target:
        accelerating = current < target * 5 / 8
        a = random.randint(1, 3) if accelerating else -random.randint(2, 4)
        step = round(v * t + 0.5 * a * t ** 2)
        if not accelerating:
            # Braking can drive the velocity negative. That used to push the
            # slider back into the acceleration zone and could keep the loop
            # oscillating, so always advance at least 1px while braking.
            step = max(step, 1)
        tracks.append(step)
        current += step
        v = max(v + a * t, 0)
    # Back up by exactly the amount travelled past the requested distance so
    # the slider ends on the target instead of a random 5-15px short of the
    # overshoot point.
    excess = current - distance
    while excess > 0:
        back = min(random.randint(1, 3), excess)
        tracks.append(-back)
        excess -= back
    return tracks
Mouse sliding operation
Drag the slider along the generated trajectory using the mouse action chain (ActionChains) in Selenium.
def mouse_move(self, slide, tracks):
    """Drag the ``slide`` element horizontally through every offset in ``tracks``.

    Each action is performed as its own action chain: press and hold on the
    slider, one horizontal move per offset, then release on the slider.
    """
    browser = self.driver
    # Press the mouse button on the slider and keep it held down
    press = ActionChains(browser)
    press.click_and_hold(slide)
    press.perform()
    # Replay the trajectory one horizontal offset at a time
    for dx in tracks:
        move = ActionChains(browser)
        move.move_by_offset(dx, 0)
        move.perform()
    # Let go of the mouse button on the slider
    drop = ActionChains(browser)
    drop.release(slide)
    drop.perform()
Avoiding Zhihu's Selenium detection
When crawling Zhihu with Selenium automation, the following error appeared: "Error code 10001: Exception request please upgrade the client and try again". This error occurs because Zhihu can detect Selenium automation scripts.
Start Chrome in remote-debugging mode and let Selenium attach to that already-running browser for crawling; this prevents Selenium from being detected by the website.
Add an environment variable
Add the directory that contains chrome.exe to the system PATH environment variable, for example C:\Program Files\Google\Chrome\Application, so that you can launch the browser directly by typing chrome.exe on the command line.
Open a cmd window and run the following command:
chrome.exe --remote-debugging-port=9222 --user-data-dir="E:\eliwang\selenium_data"
Make sure the port is not already in use. The user-data-dir option specifies the directory for the browser's profile data; you can choose any path.
The browser then starts and opens a new tab.
Key line for letting Selenium take over the running browser
# Attach Selenium to the Chrome instance listening on remote-debugging port 9222
options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
Closing the browser window
1. Use the close() method of the browser object; the quit() method does not work in this mode.
2. Open and close the browser manually.
Complete login code
# coding:utf-8
import cv2 as cv
import time
import random
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as WAIT
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from urllib.request import urlretrieve
class Zhihu_login:
    """Simulated Zhihu login that solves the slider CAPTCHA.

    The driver attaches to a Chrome instance that is already running with
    its remote-debugging address at 127.0.0.1:9222.
    """

    def __init__(self):
        options = webdriver.ChromeOptions()
        # Take over the Chrome instance listening on the debugging port
        options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        self.driver = webdriver.Chrome(options=options)
        self.wait = WAIT(self.driver, 5)
        self.url = 'https://www.zhihu.com/'
        self.bg_img_path = './bg.png'
        self.slider_img_path = './slider.png'

    def run(self):
        """Entry point: open Zhihu, log in if necessary and print the cookies.

        The slider verification is attempted at most 5 times. The browser
        tab is closed when the method ends, including when an error escapes.
        """
        try:
            self.driver.get(self.url)
            if self._logged_in(WAIT(self.driver, 3)):
                print(' Successful login ')
                self.save_cookie()
                return
            self._submit_credentials()
            # One initial attempt plus up to four retries
            for attempt in range(1, 6):
                if attempt > 1:
                    print(' Trying to %d Second landing ' % attempt)
                if self.slide_verify():
                    if attempt == 1:
                        print(' Successful login ')
                    else:
                        print(' No. 1 %d Successful landing ' % attempt)
                    self.save_cookie()
                    return
                print(' No. 1 %d Log-in failure ' % attempt)
            print(' Log in failed 5 Time, stop logging in ')
        finally:
            self.driver.close()

    def _logged_in(self, wait):
        """Return True if the 'Popover15-toggle' element shows up before ``wait`` times out.

        The login flow treats the presence of that element as the sign of a
        logged-in session.
        """
        # Function-scope import: the file's import block does not provide it
        from selenium.common.exceptions import TimeoutException
        try:
            wait.until(EC.presence_of_element_located((By.ID, 'Popover15-toggle')))
        except TimeoutException:
            return False
        return True

    def _submit_credentials(self):
        """Switch to password login, fill in account and password, and submit."""
        self.wait.until(EC.element_to_be_clickable((By.XPATH, '//div[contains(@class,"SignFlow-tabs")]/div[2]'))).click()
        for field_name, value in (('username', ' Account number '), ('password', ' Password ')):
            # find_element(By.NAME, ...) replaces find_element_by_name,
            # which no longer exists in Selenium 4
            field = self.driver.find_element(By.NAME, field_name)
            field.clear()
            field.send_keys(value)
        # Click the login button
        self.wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@type="submit"]'))).click()
        time.sleep(1)

    def slide_verify(self):
        """Solve the slider verification once.

        Downloads the background and slider images, computes the slide
        distance, drags the slider and then checks for the logged-in marker.

        Returns:
            True if the logged-in marker appears afterwards, otherwise False.
        """
        slider_button = self.wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@class="yidun_slider"]')))
        # URL of the CAPTCHA background image
        self.bg_img_url = self.wait.until(EC.presence_of_element_located((By.XPATH, '//img[@class="yidun_bg-img"]'))).get_attribute('src')
        # URL of the CAPTCHA slider image
        self.slider_img_url = self.wait.until(EC.presence_of_element_located((By.XPATH, '//img[@class="yidun_jigsaw"]'))).get_attribute('src')
        urlretrieve(self.bg_img_url, self.bg_img_path)
        urlretrieve(self.slider_img_url, self.slider_img_path)
        distance = self.get_distance(self.bg_img_path, self.slider_img_path)
        distance += 10  # the actual move has to be shifted 10px to the right
        tracks = self.get_tracks(distance)
        self.mouse_move(slider_button, tracks)
        return self._logged_in(self.wait)

    def save_cookie(self):
        """Collect the driver's cookies into a name -> value dict and print it."""
        cookie = {item['name']: item['value'] for item in self.driver.get_cookies()}
        print(cookie)
        print(' After successfully landing in Zhihu, cookie Information ')

    def mouse_move(self, slide, tracks):
        """Drag the ``slide`` element horizontally through every offset in ``tracks``."""
        # Press the mouse button on the slider and keep it held down
        ActionChains(self.driver).click_and_hold(slide).perform()
        # Replay the trajectory one horizontal offset at a time
        for track in tracks:
            ActionChains(self.driver).move_by_offset(track, 0).perform()
        ActionChains(self.driver).release(slide).perform()

    @staticmethod
    def _edge_map(img_path):
        """Read an image as grayscale and return its denoised Canny edge map.

        Raises:
            OSError: If the image cannot be read.
        """
        gray = cv.imread(img_path, 0)
        if gray is None:
            # cv.imread signals failure by returning None instead of raising
            raise OSError('Cannot read image: %s' % img_path)
        blurred = cv.GaussianBlur(gray, (3, 3), 0)
        return cv.Canny(blurred, 50, 150)

    def get_distance(self, bg_img_path='./bg.png', slider_img_path='./slider.png'):
        """Return the x coordinate at which the slider best matches the background.

        Args:
            bg_img_path: Path of the CAPTCHA background image.
            slider_img_path: Path of the slider image.

        Returns:
            The x coordinate (first component) of the best-match location.

        Raises:
            OSError: If either image cannot be read.
        """
        bg_edges = self._edge_map(bg_img_path)
        slider_edges = self._edge_map(slider_img_path)
        res = cv.matchTemplate(bg_edges, slider_edges, cv.TM_CCOEFF_NORMED)
        # minMaxLoc returns (min_val, max_val, min_loc, max_loc), for example
        # (-0.0577, 0.3097, (0, 0), (196, 1)); the best match is at max_loc.
        _, _, _, max_loc = cv.minMaxLoc(res)
        return max_loc[0]

    def get_tracks(self, distance):
        """Build a human-like list of per-step x offsets that sum to ``distance``.

        The slider accelerates over the first 5/8 of the way and brakes
        afterwards. It deliberately runs at least 10px past the target, then
        backs up in small random steps until it sits exactly on the target.

        Args:
            distance: Net number of pixels the slider has to end up moved.

        Returns:
            A list of integer offsets; forward steps first, followed by the
            negative back-off steps.
        """
        overshoot = 10  # run this far past the target before backing up
        target = distance + overshoot
        tracks = []
        v = 0  # current velocity
        t = 0.2  # duration of one step
        current = 0  # displacement so far
        while current < target:
            accelerating = current < target * 5 / 8
            a = random.randint(1, 3) if accelerating else -random.randint(2, 4)
            step = round(v * t + 0.5 * a * t ** 2)
            if not accelerating:
                # Braking can drive the velocity negative. That used to push
                # the slider back into the acceleration zone and could keep
                # the loop oscillating, so always advance at least 1px here.
                step = max(step, 1)
            tracks.append(step)
            current += step
            v = max(v + a * t, 0)
        # Back up by exactly the amount travelled past the requested distance
        # so the slider ends on the target.
        excess = current - distance
        while excess > 0:
            back = min(random.randint(1, 3), excess)
            tracks.append(-back)
            excess -= back
        return tracks
if __name__ == '__main__':
    # Run the whole login flow when the file is executed as a script
    login = Zhihu_login()
    login.run()