Using pytesseract in Python for local recognition of text in pictures

  • 2021-08-28 20:30:57
  • OfStack


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import glob
from os import path
import os
import pytesseract
from PIL import Image
from queue import Queue
import threading
import datetime
import cv2

def convertimg(picfile, outdir):
  """Shrink an oversized picture and save the result into ``outdir``.

  The width and height are halved repeatedly until their product is at most
  4,000,000 pixels. The picture is then resized with bilinear filtering and
  saved in ``outdir`` under its original base name. Pictures that are already
  small enough are re-saved at their original size.

  Args:
    picfile: Path of the picture to read.
    outdir: Directory the resized picture is written to.
  """
  # The context manager closes the source file even if resizing or saving fails.
  with Image.open(picfile) as img:
    width, height = img.size
    # Per the original author's note, pictures at this pixel count come out
    # at roughly 200 KB or more after compression.
    while width * height > 4000000:
      # Keep each side at 1 or more: halving an extremely narrow picture
      # would otherwise reach 0 and produce an invalid target size.
      width = max(1, width // 2)
      height = max(1, height // 2)
    new_img = img.resize((width, height), Image.BILINEAR)
    new_img.save(path.join(outdir, path.basename(picfile)))


def baiduOCR(ts_queue):
  """Worker loop: recognise every picture queued in ``ts_queue``.

  Each queued path is read with OpenCV and passed to
  ``pytesseract.image_to_string`` (language ``eng``). The text, with line
  breaks removed, is appended to the output file as a ``'<name>':<text>,``
  line, where ``<name>`` is the picture's base name up to its first dot.
  The picture file is deleted once its text has been written. The function
  returns when the queue has no more items.

  Args:
    ts_queue: ``queue.Queue`` of picture paths. Several threads may run this
      function on the same queue.
  """
  from queue import Empty  # local import: the file header only imports Queue

  # Raw string: the backslashes are path separators, not escape sequences.
  outfile = r'D:\Study\pythonProject\scrapy\IpProxy\port_zidian.txt'

  while True:
    # get_nowait() instead of empty() + get(): with several workers another
    # thread can take the last item between the two calls, and the blocking
    # get() would then wait forever.
    try:
      picfile = ts_queue.get_nowait()
    except Empty:
      return

    filename = path.basename(picfile)
    img = cv2.imread(picfile, cv2.IMREAD_COLOR)
    print(" Identifying pictures: \t" + filename)
    if img is None:
      # cv2.imread returns None instead of raising when it cannot read a file.
      print(' Recognition failure ')
      continue

    message = pytesseract.image_to_string(img, lang='eng')
    # The original replace('', '') was a no-op. Presumably it was meant to
    # drop the form feed Tesseract appends as a page separator -- TODO confirm.
    message = message.replace('\x0c', '').replace('\n', '')

    key = filename.split('.')[0]
    try:
      # One write per record so records from different workers are less
      # likely to interleave; UTF-8 so non-ASCII OCR output can be stored.
      with open(outfile, 'a', encoding='utf-8') as fo:
        fo.write("'" + key + "':" + message + ",\n")
      # Delete the queued file itself; the bare file name would be looked up
      # relative to the current working directory.
      os.remove(picfile)
    except OSError as exc:
      print(' Recognition failure ')
      print(' ' + filename + ': ' + str(exc))
    else:
      print(" Recognition successful! ")

    print(" Text export succeeded! ")
    print()
def duqu_tupian(dir, src_pattern=r"D:\Study\pythonProject\scrapy\IpProxy\tmp\*"):
  """Compress the source pictures into ``dir`` and queue them for recognition.

  Every file matching ``src_pattern`` is passed to ``convertimg`` so that a
  size-limited copy lands in ``dir``. Afterwards the path of every entry
  found in ``dir`` is put on a queue.

  Args:
    dir: Working directory that receives the compressed pictures. It is
      created (including parents) when missing.
    src_pattern: Glob pattern selecting the original pictures. The default is
      the location that used to be hard-coded in this function.

  Returns:
    A ``Queue`` of picture paths. If compressing or listing fails,
    `` Failure `` is printed and the queue is returned with whatever had
    been queued up to that point (possibly nothing).
  """
  # Unbounded on purpose: nothing consumes the queue while it is being
  # filled, so a size limit would make put() block forever once reached.
  ts_queue = Queue()

  outdir = dir
  os.makedirs(outdir, exist_ok=True)

  print(" Compress an oversized picture ...")
  # Compress first to speed up recognition; the compressed copies are kept
  # in the working directory.
  try:
    for picfile in glob.glob(src_pattern):
      convertimg(picfile, outdir)
    print(" Picture recognition ...")
    # List the directory that was actually written to rather than a fixed
    # "tmp1" folder; escape it so special characters are taken literally.
    for picfile in glob.glob(path.join(glob.escape(outdir), "*")):
      ts_queue.put(picfile)
    print(' Picture text extraction is over! The text output results are located in the file. ' )
  except Exception as exc:
    # Best effort: report the problem but still hand back a usable queue so
    # the caller's workers do not crash on None.
    print(' Failure ')
    print(' ' + str(exc))
  return ts_queue

if __name__ == "__main__":
  # Script entry point: compress the pictures, then recognise them with a
  # pool of worker threads and report the elapsed time.
  start = datetime.datetime.now().replace(microsecond=0)
  workdir = 'tmp1'
  s = duqu_tupian(workdir)
  if s is None:
    # Guard against a missing queue: workers given None would all crash.
    print(' Recognition failure ')
  else:
    # No point starting more workers than there are queued pictures.
    worker_count = min(100, s.qsize())
    threads = [
      threading.Thread(target=baiduOCR, name='th-' + str(i), kwargs={'ts_queue': s})
      for i in range(worker_count)
    ]
    try:
      for worker in threads:
        worker.start()
      for worker in threads:
        worker.join()
    except RuntimeError as exc:
      # Thread.start() raises RuntimeError when a thread cannot be started.
      # Other exceptions (e.g. KeyboardInterrupt) are allowed to propagate.
      print(' Recognition failure ')
      print(' ' + str(exc))
    else:
      end = datetime.datetime.now().replace(microsecond=0)
      print(' Delete time: ' + str(end - start))

In testing, recognition is slow, although multithreading speeds it up noticeably; the accuracy is also slightly lower. On the same high-definition pictures the recognition rate is about 90%, and the output occasionally contains garbled characters and stray spaces, which cannot be shown here — try it yourself. The main attraction is that it is free, so you can run recognition as often as you like. Processing 100 pictures takes almost 6 minutes, which is about twice as slow, but since it costs nothing, that is quite good.

That covers the details of using pytesseract in Python for local recognition of text in pictures. For more information about recognizing text in pictures with Python, please see the other related articles on this site!


Related articles: