Common anti-crawling mechanisms and how to get around them in Python

2021-11-10 10:00:52
OfStack

1. Common anti-crawling mechanisms and their cracking methods

IP banning and the use of cookies were covered in previous articles.

This article mainly covers the following:

~ Verification codes (CAPTCHAs)
-> Text verification code -> OCR (Optical Character Recognition) -> third-party interface / easyocr
If the program cannot solve the verification code by itself, consider using a third-party interface (paid or free)
-> Behavioral verification code -> Chaojiying ("Super Eagle")
~ Mobile phone number + SMS verification code
-> SMS code-receiving platform
~ Dynamic content
-> JavaScript reverse engineering -> locate the API endpoint that provides the data
-> Capture the mobile app's network traffic -> packet-capture tools (Charles/Fiddler)
-> Selenium directly simulates browser operation to obtain dynamic content
~ find_element_by_xxx / find_elements_by_xxx (removed in Selenium 4.3; use find_element(By.XXX, ...) / find_elements(By.XXX, ...) instead)
~ page_source -> get the source code of a web page, including its dynamic content
-> JavaScript encryption and obfuscation -> being able to read JavaScript is a prerequisite for getting past this kind of anti-crawling
~ Font-based anti-crawling / content rendered from image cut-outs
-> Example

bytes --> immutable byte string --> binary data --> BytesIO
str --> immutable string --> readable characters --> StringIO

2. Calling a third-party API for data (TianAPI)


import requests

# Fetch pages 1-5 of the TianAPI "topnews" endpoint and print the title and
# URL of every news item returned.
for page in range(1, 6):
    response = requests.get(
        'http://api.tianapi.com/topnews/index',
        params={
            'key': 'd5eace66dccd771e36767ce3563efa09',
            'page': page,
            'num': 20,
            'word': ' Huawei ',
            # Double-quoted on purpose: the value contains an apostrophe, which
            # terminated the single-quoted literal early (a SyntaxError).
            'src': " People's Daily ",
        },
        # Without a timeout, requests waits indefinitely on a stalled server.
        timeout=10,
    )
    result = response.json()
    news_items = result.get('newslist')
    if news_items is None:
        # NOTE(review): presumably an error payload (bad key, quota, no
        # results); show it instead of failing with a bare KeyError.
        print(f'page {page}: response has no "newslist": {result}')
        break
    for news in news_items:
        print(news['title'])
        print(news['url'])

3. OCR (Optical Character Recognition) Library

Using the easyocr library from Python


import easyocr
# Build a recognizer for Simplified Chinese and English with the GPU disabled,
# then print whatever it reads from the sample captcha image.
languages = ['ch_sim', 'en']
reader = easyocr.Reader(languages, gpu=False)
# NOTE(review): detail=0 presumably requests easyocr's simplified output
# (text only) - confirm against the easyocr documentation.
recognized = reader.readtext('./files/captcha.jpg', detail=0)
print(recognized)

Example: Automatic login of Alibaba Cloud mailbox


import io

import easyocr

from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

# Log in to the mailbox automatically: fill in the credentials, take a
# screenshot of the browser window, crop the captcha out of it, read the
# captcha with easyocr and submit the form.
browser = webdriver.Chrome()
browser.set_window_size(1280, 960)
browser.get('http://mail.1000phone.com/')
# Implicit wait: element lookups keep retrying for up to 10 seconds.
browser.implicitly_wait(10)
# Explicit wait: block until the outer login iframe is present. until()
# returns the located element, so no second lookup is needed.
wait = WebDriverWait(browser, 10)
iframe1 = wait.until(
    expected_conditions.presence_of_element_located((By.CSS_SELECTOR, '.login_panel_iframe'))
)
# Record the iframe's position; the offsets of both iframes and of the captcha
# are summed further down to locate the captcha in the screenshot.
x1, y1 = iframe1.location['x'], iframe1.location['y']
# switch_to.frame() moves the driver's context from the page into the iframe.
browser.switch_to.frame(iframe1)
# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector(),
# which was removed in Selenium 4.3; this form also works on older releases.
iframe2 = browser.find_element(By.CSS_SELECTOR, '#ding-login-iframe')
x2, y2 = iframe2.location['x'], iframe2.location['y']
browser.switch_to.frame(iframe2)
# Simulate the user typing the credentials.
username_input = browser.find_element(By.CSS_SELECTOR, '#username')
username_input.send_keys('xx@1000phone.com')
password_input = browser.find_element(By.CSS_SELECTOR, '#password')
password_input.send_keys('xxxxx!!')
# Wait until the captcha image is clickable (reuses the wait object above).
captcha_img = wait.until(
    expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, '#login_checkcode_ico'))
)
# A WebElement's size gives its width/height; location gives its position.
size, location = captcha_img.size, captcha_img.location
x3, y3, width, height = location['x'], location['y'], size['width'], size['height']
# Screenshot the whole browser window as PNG bytes and open it with Pillow;
# io.BytesIO wraps the bytes in the file-like object Image.open() expects.
image_data = browser.get_screenshot_as_png()
browser_image = Image.open(io.BytesIO(image_data))
# Position of the captcha in the screenshot = sum of the nested offsets.
x, y = x1 + x2 + x3, y1 + y2 + y3
# Ratio between screenshot pixels and element coordinates. 1.25 is the value
# the author used on Windows; the macOS variant of this script used 2.
# If the crop is still wrong, hard-code the crop box instead.
scale = 1.25
checkcode_image = browser_image.crop((x * scale, y * scale, (x + width) * scale, (y + height) * scale))
checkcode_image.save('result.png')
# Read the captcha with easyocr.
reader = easyocr.Reader(['en'], gpu=False)
texts = reader.readtext('result.png', detail=0)
if not texts:
    # Indexing an empty result would raise a bare IndexError; fail clearly.
    raise RuntimeError('easyocr did not recognize any text in result.png')
code = texts[0]
# Type the recognized captcha into its text box and submit the form.
checkcode_input = browser.find_element(By.CSS_SELECTOR, '#login_checkcode')
checkcode_input.send_keys(code)
login_button = browser.find_element(By.CSS_SELECTOR, '#login_submit_btn')
login_button.click()

4. Third-party captcha-solving platform (Chaojiying, "Super Eagle")

Note: this requires the third-party Pillow library (install it with `pip install pillow`); it is not part of Python's standard library.


"""
Pillow Library  ---> PIL ---> Python Image Library
"""
from PIL import Image, ImageFilter

# Load the sample image from disk.
guido_image = Image.open('guido.jpg')
# Crop a region out of the image and display the crop.
crop_box = (80, 40, 310, 350)
cropped = guido_image.crop(crop_box)
cropped.show()
# Apply the CONTOUR filter and display the filtered copy.
contoured = guido_image.filter(ImageFilter.CONTOUR)
contoured.show()
# thumbnail() is called for its effect on guido_image itself (its return
# value is not used), so it must come after the crop and filter steps.
thumbnail_size = (125, 185)
guido_image.thumbnail(thumbnail_size)
# Display the image after the thumbnail call.
guido_image.show()

Writing a client class for the Chaojiying (Super Eagle) captcha-solving platform


from hashlib import md5

import requests


class ChaojiyingClient:
    """Small HTTP client for the Chaojiying captcha-solving platform.

    Every request carries the account name, the MD5 hex digest of the
    password and the software ID, together with a fixed set of headers.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """
        username: account name, sent as ``user``
        password: plain-text password; only its MD5 hex digest is kept and sent
        soft_id: software ID, sent as ``softid``
        timeout: seconds passed to requests as the request timeout. Without
            one, requests waits indefinitely on a stalled server.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Credentials merged into the form data of every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def post_pic(self, image_data, code_type):
        """
        Upload a captcha image and return the platform's decoded JSON reply.

        image_data: image bytes
        code_type: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': code_type,
        }
        params.update(self.base_params)
        # The image goes up as a multipart file field named "userfile".
        files = {'userfile': ('ccc.jpg', image_data)}
        response = requests.post(
            url='http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return response.json()

    def report_error(self, im_id):
        """
        Report a wrongly solved captcha back to the platform (feedback only)
        and return the platform's decoded JSON reply.

        im_id: ID of the picture being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()


if __name__ == '__main__':
    # Demo: send a local image to the platform and print the reply.
    # Replace the placeholders with your own account, password and the
    # software ID generated in the platform's user center.
    chaojiying = ChaojiyingClient(' Accounts ', ' Password x', 'ID')
    # Replace 'img.png' with the path of the local image to solve.
    with open('img.png', 'rb') as file:
        image_data = file.read()
    # 1902 is the captcha type; see the price list on the official website.
    result = chaojiying.post_pic(image_data, 1902)
    print(result)

Example: automatic login to the Alibaba Cloud mailbox using Chaojiying (Super Eagle)


import io

import easyocr

from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

from chaojiying import ChaojiyingClient

# Log in to the mailbox automatically: fill in the credentials, take a
# screenshot of the browser window, crop the captcha out of it, have the
# Chaojiying platform solve the captcha and submit the form.
browser = webdriver.Chrome()
browser.set_window_size(1280, 960)
browser.get('http://mail.1000phone.com/')
# Implicit wait: element lookups keep retrying for up to 10 seconds.
browser.implicitly_wait(10)
# Explicit wait: block until the outer login iframe is present. until()
# returns the located element, so no second lookup is needed.
wait = WebDriverWait(browser, 10)
iframe1 = wait.until(
    expected_conditions.presence_of_element_located((By.CSS_SELECTOR, '.login_panel_iframe'))
)
# Record the iframe's position; the offsets of both iframes and of the captcha
# are summed further down to locate the captcha in the screenshot.
x1, y1 = iframe1.location['x'], iframe1.location['y']
# switch_to.frame() moves the driver's context from the page into the iframe.
browser.switch_to.frame(iframe1)
# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector(),
# which was removed in Selenium 4.3; this form also works on older releases.
iframe2 = browser.find_element(By.CSS_SELECTOR, '#ding-login-iframe')
x2, y2 = iframe2.location['x'], iframe2.location['y']
browser.switch_to.frame(iframe2)
# Simulate the user typing the credentials.
username_input = browser.find_element(By.CSS_SELECTOR, '#username')
username_input.send_keys('xxxx.com')
password_input = browser.find_element(By.CSS_SELECTOR, '#password')
password_input.send_keys('xxxx!!')
# Wait until the captcha image is clickable (reuses the wait object above).
captcha_img = wait.until(
    expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, '#login_checkcode_ico'))
)
# A WebElement's size gives its width/height; location gives its position.
size, location = captcha_img.size, captcha_img.location
x3, y3, width, height = location['x'], location['y'], size['width'], size['height']
# Screenshot the whole browser window as PNG bytes and open it with Pillow;
# io.BytesIO wraps the bytes in the file-like object Image.open() expects.
image_data = browser.get_screenshot_as_png()
browser_image = Image.open(io.BytesIO(image_data))
# Position of the captcha in the screenshot = sum of the nested offsets.
x, y = x1 + x2 + x3, y1 + y2 + y3
# Ratio between screenshot pixels and element coordinates. 1.25 is the value
# the author used on Windows; the macOS variant of this script used 2.
# If the crop is still wrong, hard-code the crop box instead.
scale = 1.25
checkcode_image = browser_image.crop((x * scale, y * scale, (x + width) * scale, (y + height) * scale))
checkcode_image.save('result.png')
# Have the Chaojiying platform solve the cropped captcha.
chaojiying = ChaojiyingClient(' Accounts ', ' Password ', 'ID')
with open('result.png', 'rb') as file:
    image_data = file.read()
result_dict = chaojiying.post_pic(image_data, 1902)
code = result_dict.get('pic_str')
if not code:
    # NOTE(review): presumably an error reply (bad credentials, no credit);
    # show it instead of failing with a bare KeyError or typing nothing.
    raise RuntimeError(f'Chaojiying returned no captcha text: {result_dict}')
# Type the solved captcha into its text box and submit the form.
checkcode_input = browser.find_element(By.CSS_SELECTOR, '#login_checkcode')
checkcode_input.send_keys(code)
login_button = browser.find_element(By.CSS_SELECTOR, '#login_submit_btn')
login_button.click()

5. Receiving the mobile phone verification code through an SMS code-receiving platform (yinsiduanxin.com, "private SMS")

Receive the verification code through the free yinsiduanxin.com ("private SMS") platform


import re

import bs4
import requests

# Download the message page of one public phone number, take the cell selected
# below (first table row, second column) and print the first run of 4-6 digits
# in it, i.e. the verification code.
pattern = re.compile(r'\d{4,6}')

resp = requests.get(
    'https://www.yinsiduanxin.com/china-phone-number/verification-code-16521686439.html',
    # Without a timeout, requests waits indefinitely on a stalled server.
    timeout=10,
)
# Stop on an HTTP error status instead of parsing an error page.
resp.raise_for_status()
soup = bs4.BeautifulSoup(resp.text, 'html.parser')
td = soup.select_one('body > div.container > div:nth-child(4) > div:nth-child(3) > div.main > div.layui-row > table > tbody > tr:nth-child(1) > td:nth-child(2)')
if td is None:
    # select_one() returns None when nothing matches; reading .text on it
    # would raise an unhelpful AttributeError.
    raise RuntimeError('message cell not found - the page layout may have changed')
match = pattern.search(td.text)
if match is None:
    raise RuntimeError(f'no 4-6 digit code found in message: {td.text!r}')
print(match.group())