# ScanOS/backend/app/scanner/tesseract.py
#
# 35 lines
# 1.2 KiB
# Python

import logging
import pyocr
import pyocr.libtesseract
from PIL import Image
class Tesseract:
    """Thin wrapper around pyocr for page orientation fixes and PDF creation.

    The OCR backend is chosen once at construction time and stored in
    ``self.tool``; all log output goes through the logger given to
    ``__init__``.
    """

    # Language code handed to the OCR backend for every call.
    _LANG = "deu"
    # Directory the page images are read from and the output is written to.
    _IMAGE_DIR = "/var/www/html/img"

    def __init__(self, logger = logging.getLogger()):
        """Select the OCR tool to use.

        Args:
            logger: Logger that receives all messages from this instance.
                Defaults to the root logger.

        Raises:
            RuntimeError: If pyocr reports no available OCR tool.
        """
        self.logger = logger
        tools = pyocr.get_available_tools()
        if not tools:
            # Fail with a clear message instead of an IndexError one line
            # later when indexing into the empty list.
            self.logger.error("No OCR tool found")
            raise RuntimeError("No OCR tool found")
        # create_pdf() relies on the libtesseract PDF builder, so prefer that
        # backend explicitly rather than depending on its position in the
        # list (a fixed index breaks when fewer tools are installed).
        if pyocr.libtesseract in tools:
            self.tool = pyocr.libtesseract
        else:
            self.tool = tools[0]
        self.logger.info("Will use tool '%s'", self.tool.get_name())

    def rotate_img(self, image: Image.Image) -> Image.Image:
        """Return a copy of ``image`` rotated by the detected orientation angle.

        Args:
            image: The page image to analyse and rotate.

        Returns:
            A new image rotated by the ``"angle"`` value reported by the OCR
            tool's orientation detection, with the canvas expanded so the
            whole rotated page fits.
        """
        orientation = self.tool.detect_orientation(
            image,
            lang=self._LANG,
        )
        angle = orientation["angle"]
        self.logger.info(
            "Tesseract: Rotate by %s degrees to correct (Confidence: %s)",
            angle,
            orientation["confidence"],
        )
        return image.rotate(angle, expand=True)

    def create_pdf(self, scanner):
        """Run OCR over every scanned page and build one PDF from them.

        Args:
            scanner: Object whose ``get_pages()`` yields pages exposing a
                ``filename`` attribute, resolved relative to the image
                directory.
        """
        builder = pyocr.libtesseract.LibtesseractPdfBuilder()
        builder.set_lang(self._LANG)
        builder.set_output_file(f"{self._IMAGE_DIR}/out")
        for page in scanner.get_pages():
            filename = f"{self._IMAGE_DIR}/{page.filename}"
            self.logger.info(filename)
            # Close each image file once the builder has consumed it so long
            # scans do not accumulate open file handles.
            with Image.open(filename) as img:
                builder.add_image(img)
        builder.build()