Python uses pytesseract to perform local recognition of text in images
- 2021-08-28 20:30:57
- OfStack
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import glob
from os import path
import os
import pytesseract
from PIL import Image
from queue import Queue
import threading
import datetime
import cv2
def convertimg(picfile, outdir):
    """Downscale an oversized image and save a copy of it into *outdir*.

    The dimensions are halved repeatedly until the image has at most
    4,000,000 pixels (roughly 200 KB once saved), then the (possibly
    resized) image is written to *outdir* under its original base name.
    Images already under the threshold are re-saved at full size.

    picfile: path of the source image
    outdir:  directory the compressed copy is written to
    """
    # Context manager releases the underlying file handle; a bare
    # Image.open() keeps it open lazily, which leaks handles when
    # many images are processed in a row.
    with Image.open(picfile) as img:
        width, height = img.size
        while width * height > 4000000:  # halve until under ~4 MP
            width = width // 2
            height = height // 2
        new_img = img.resize((width, height), Image.BILINEAR)
        new_img.save(path.join(outdir, os.path.basename(picfile)))
def baiduOCR(ts_queue):
    """Thread worker: OCR every queued image with pytesseract.

    Pops image paths from *ts_queue*, recognizes the text, and appends a
    dict-style entry ('<name>':<text>,) per image to the output file.
    Successfully processed images are deleted from disk.

    ts_queue: Queue of image file paths (fully populated before the
              workers start).
    """
    # Raw string so the backslashes cannot be taken for escape sequences.
    outfile = r'D:\Study\pythonProject\scrapy\IpProxy\port_zidian.txt'
    # NOTE(review): empty()/get() is racy in general; acceptable here only
    # because the queue is filled completely before any worker runs.
    while not ts_queue.empty():
        picfile = ts_queue.get()
        filename = path.basename(picfile)
        img = cv2.imread(picfile, cv2.IMREAD_COLOR)
        print(" Identifying pictures: \t" + filename)
        message = pytesseract.image_to_string(img, lang='eng')
        # Strip spaces and newlines from the OCR result.  The original
        # replace('', '') was a no-op; ' ' was clearly intended.
        message = message.replace(' ', '')
        message = message.replace('\n', '')
        try:
            filename1 = filename.split('.')[0]
            with open(outfile, 'a+') as fo:
                fo.writelines('\'' + filename1 + '\'' + ':' + message + ',')
                fo.writelines('\n')
            # Remove the queued path, not just the basename: the original
            # os.remove(filename) looked in the CWD, not the image folder.
            os.remove(picfile)
            print(" Recognition successful! ")
        except OSError:
            # File write or delete failed; keep the worker alive.
            print(' Recognition failure ')
        print(" Text export succeeded! ")
        print()
def duqu_tupian(dir):
    """Compress source images into *dir*, then queue them for OCR.

    Every image in the hardcoded source folder is compressed via
    convertimg() into *dir*; the compressed copies are then put on a
    Queue for the OCR worker threads.

    dir: output directory for the compressed images (created if absent).
    Returns the filled Queue, or None when compression/queuing failed.
    """
    ts_queue = Queue(10000)
    outdir = dir
    if not path.exists(outdir):
        os.mkdir(outdir)
    print(" Compress an oversized picture ...")
    # Compress first to speed up recognition; compressed copies land in
    # the temporary output folder.
    try:
        for picfile in glob.glob(r"D:\Study\pythonProject\scrapy\IpProxy\tmp\*"):
            convertimg(picfile, outdir)
        print(" Picture recognition ...")
        # Queue the compressed copies from outdir.  The original
        # hardcoded "tmp1/*" here, silently ignoring the parameter.
        for picfile in glob.glob(path.join(outdir, '*')):
            ts_queue.put(picfile)
        print(' Picture text extraction is over! The text output results are located in the file. ')
        return ts_queue
    except OSError:
        # Be explicit that failure yields None so the caller can check.
        print(' Failure ')
        return None
if __name__ == "__main__":
start = datetime.datetime.now().replace(microsecond=0)
t = 'tmp1'
s = duqu_tupian(t)
threads = []
try:
for i in range(100):
t = threading.Thread(target=baiduOCR, name='th-' + str(i), kwargs={'ts_queue': s})
threads.append(t)
for t in threads:
t.start()
for t in threads:
t.join()
end = datetime.datetime.now().replace(microsecond=0)
print(' Delete time: ' + str(end - start))
except:
print(' Recognition failure ')
In testing, recognition on its own is slow, but multithreading improves the speed noticeably at the cost of slightly lower accuracy: the same high-definition images reach roughly a 90% recognition rate, with occasional garbled characters and stray spaces (not reproduced here — try it yourself). The key advantage is that it is free and can be used without limits; processing 100 images takes almost 6 minutes, about half the speed of a paid service, but being free it is still quite good.
The above covers the details of using pytesseract in Python for local recognition of text in images. For more on recognizing image text with Python, please see the other related articles on this site!