Add OCR and image correction

This commit is contained in:
Oliver Traber 2024-07-07 00:30:13 +02:00
parent e74d3a4e27
commit e93117aeb4
Signed by: Bluemedia
GPG key ID: C0674B105057136C
13 changed files with 228 additions and 20 deletions

View file

View file

@ -0,0 +1,16 @@
from PIL import Image
import ocrmypdf
def create_pdf(scanner):
    """Assemble all scanned pages into a single multi-page A4 PDF.

    Each page image is pasted onto a white A4 canvas (top-left aligned) and
    the canvases are written to ``/var/www/html/img/out.pdf``.

    Args:
        scanner: Object exposing ``get_pages()``; every page must provide a
            ``filename`` attribute relative to ``/var/www/html/img``.

    Raises:
        ValueError: If there is no page with a file to put into the PDF.
    """
    # The canvas is sized for A4 (210 mm x 297 mm) at this resolution.
    dpi = 200
    a4_size = (int(210 * dpi / 25.4), int(297 * dpi / 25.4))

    images = []
    for page in scanner.get_pages():
        # A page without a file name has nothing on disk to embed yet.
        if page.filename is None:
            continue
        with Image.open(f"/var/www/html/img/{page.filename}") as img:
            a4im = Image.new('RGB', a4_size, (255, 255, 255))
            # Paste at the origin. Using img.getbbox() as the target box fails
            # whenever the bounding box differs from the image size (or is
            # None for an all-black image).
            a4im.paste(img, (0, 0))
        images.append(a4im)

    if not images:
        raise ValueError("No scanned pages available to build a PDF")

    # Pass the canvas resolution so the PDF page is really A4; Pillow would
    # otherwise assume 72 dpi and produce an oversized page.
    images[0].save(
        "/var/www/html/img/out.pdf",
        save_all=True,
        append_images=images[1:],
        resolution=dpi,
    )
def ocr_pdf():
    """Run ``ocrmypdf.ocr`` on the assembled ``out.pdf``, writing ``final.pdf``."""
    source_pdf = '/var/www/html/img/out.pdf'
    target_pdf = '/var/www/html/img/final.pdf'
    ocrmypdf.ocr(source_pdf, target_pdf)

View file

View file

@ -1,9 +1,11 @@
from pydantic import BaseModel
from typing import Optional
import app.scanner.enums as scan
class ScanPage(BaseModel):
    """API schema describing one scanned page."""

    # Declared once, with an explicit default: a page may not have a file
    # name yet, so the field is optional.
    filename: Optional[str] = None
    size_bytes: int
    status: scan.PageStatus

    class Config():
        # Allow building the schema from attribute-style objects.
        orm_mode = True

View file

@ -1,4 +1,4 @@
import threading
import threading, logging
from contextlib import asynccontextmanager
from typing import Annotated
@ -7,13 +7,25 @@ from fastapi import FastAPI, Depends
from app.data import models
from app.data.database import SessionLocal, engine
from uvicorn.logging import DefaultFormatter
from app.scanner.scanner import Scanner
from app.scanner.scanner import Status as ScannerStatus
# Set up logging
# logging.getLogger() without a name returns the root logger, so the handler
# and level configured below apply to every logger that propagates to it.
logger = logging.getLogger()
__syslog = logging.StreamHandler()
__syslog.setFormatter(DefaultFormatter(fmt="%(levelprefix)s %(message)s", use_colors=True))
logger.setLevel(logging.INFO)
logger.addHandler(__syslog)
# Create database
models.Base.metadata.create_all(bind=engine)
# NOTE(review): Scanner is constructed twice below (without and with the
# logger). This looks like the removed and added line of the diff shown side
# by side; only the second call passes the configured logger - confirm the
# first one is gone from the real file.
__scanner = Scanner("/var/www/html/img")
# Set up scanner instance
__scanner = Scanner("/var/www/html/img", logger)
# Preload scanner after FastAPI start
@asynccontextmanager
async def __lifespan(app: FastAPI):
threading.Thread(target=__scanner.preload).start()

View file

@ -8,6 +8,10 @@ class Status(Enum):
ERR_NO_PAPER = "err_no_paper"
ERR_COVER_OPEN = "err_cover_open"
# Processing state of a single scanned page; the string values are the ones
# the frontend type declares for a page's `status`.
class PageStatus(Enum):
# Page has been recorded but its image is presumably not finished yet.
PROCESSING = "processing"
# Page image is available.
DONE = "done"
class Setting(Enum):
PAPER_SOURCE = "source"
COLOR_MODE = "color"

View file

@ -0,0 +1,97 @@
import cv2
import numpy as np
def order_points(pts):
    """Rearrange corner coordinates into a fixed order.

    Args:
        pts: Corner points, either as a sequence of ``[x, y]`` pairs or as an
            OpenCV contour array of shape ``(N, 1, 2)``.

    Returns:
        A list of four ``[x, y]`` integer pairs ordered top-left, top-right,
        bottom-right, bottom-left.

    Raises:
        ValueError: If ``pts`` contains no points.
    """
    # OpenCV contours carry a singleton middle axis; flatten to (N, 2) so the
    # per-point sums and differences below are computed per point.
    pts = np.asarray(pts).reshape(-1, 2)
    if len(pts) == 0:
        raise ValueError("order_points() requires at least one point")

    rect = np.zeros((4, 2), dtype='float32')

    point_sum = pts.sum(axis=1)
    # Top-left point has the smallest x + y, bottom-right the largest.
    rect[0] = pts[np.argmin(point_sum)]
    rect[2] = pts[np.argmax(point_sum)]

    point_diff = pts[:, 1] - pts[:, 0]
    # Top-right point has the smallest y - x, bottom-left the largest.
    rect[1] = pts[np.argmin(point_diff)]
    rect[3] = pts[np.argmax(point_diff)]

    return rect.astype('int').tolist()
def find_dest(pts):
    """Compute the target rectangle for a perspective warp.

    Args:
        pts: Four corners in the order top-left, top-right, bottom-right,
            bottom-left.

    Returns:
        The corners of an origin-anchored rectangle whose width and height
        are the longer of the two opposing edges (each truncated to int),
        passed through ``order_points``.
    """
    top_left, top_right, bottom_right, bottom_left = pts

    def edge_length(p, q):
        # Euclidean distance between two corner points.
        return np.sqrt(((p[0] - q[0]) ** 2) + ((p[1] - q[1]) ** 2))

    # Longest horizontal edge (bottom vs. top).
    width = max(int(edge_length(bottom_right, bottom_left)),
                int(edge_length(top_right, top_left)))
    # Longest vertical edge (right vs. left).
    height = max(int(edge_length(top_right, bottom_right)),
                 int(edge_length(top_left, bottom_left)))

    return order_points([[0, 0], [width, 0], [width, height], [0, height]])
def correct_image(img_path):
    """Find the page outline in a scan and warp it to a flat rectangle.

    The image is downscaled so its longest side is at most 1080 px, text is
    removed with a morphological close, the foreground is isolated with
    GrabCut, and the largest four-cornered contour is taken as the page. On
    success the warped image overwrites ``img_path``.

    Args:
        img_path: Path of the image file to correct in place.

    Returns:
        ``None`` after a successful correction. If no usable four-cornered
        outline is found, the (possibly downscaled) original image is
        returned and the file is left untouched.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    img = cv2.imread(img_path)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise ValueError(f"Could not read image: {img_path}")

    # Resize image to workable size
    dim_limit = 1080
    max_dim = max(img.shape[:2])
    if max_dim > dim_limit:
        resize_scale = dim_limit / max_dim
        img = cv2.resize(img, None, fx=resize_scale, fy=resize_scale)
    # Keep the resized original; the warp is applied to it later.
    orig_img = img.copy()

    # Repeated closing operation to remove text from the document.
    kernel = np.ones((5, 5), np.uint8)
    img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel, iterations=3)

    # GrabCut, initialised with a rectangle inset 20 px from the top-left.
    mask = np.zeros(img.shape[:2], np.uint8)
    bgd_model = np.zeros((1, 65), np.float64)
    fgd_model = np.zeros((1, 65), np.float64)
    rect = (20, 20, img.shape[1] - 20, img.shape[0] - 20)
    cv2.grabCut(img, mask, rect, bgd_model, fgd_model, 5, cv2.GC_INIT_WITH_RECT)
    # Mask values 0 and 2 mark (probable) background.
    foreground = np.where((mask == 2) | (mask == 0), 0, 1).astype('uint8')
    img = img * foreground[:, :, np.newaxis]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (11, 11), 0)

    # Edge detection.
    canny = cv2.Canny(gray, 0, 200)
    canny = cv2.dilate(canny, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5)))

    # Consider only the five largest contours as page candidates.
    contours, _ = cv2.findContours(canny, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
    candidates = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

    # Take the first candidate whose polygon approximation has four corners.
    corners = None
    for contour in candidates:
        epsilon = 0.02 * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, epsilon, True)
        if len(approx) == 4:
            corners = approx
            break
    if corners is None:
        # No page outline found: warping arbitrary points would distort the scan.
        return orig_img

    # approxPolyDP returns shape (4, 1, 2); flatten to four [x, y] pairs.
    corners = order_points(corners.reshape(-1, 2).tolist())
    destination_corners = find_dest(corners)
    out_width, out_height = destination_corners[2]
    if out_width <= 0 or out_height <= 0:
        # Degenerate outline; nothing sensible to warp to.
        return orig_img

    # getPerspectiveTransform requires float32 arrays, not Python lists.
    M = cv2.getPerspectiveTransform(np.float32(corners),
                                    np.float32(destination_corners),
                                    cv2.DECOMP_LU)
    # Perspective transform using the homography.
    final = cv2.warpPerspective(orig_img, M, (out_width, out_height),
                                flags=cv2.INTER_LINEAR)
    cv2.imwrite(img_path, final)

View file

@ -1,8 +1,12 @@
import gi, os, threading
from typing import List
import gi, os, threading, logging
from typing import List, Optional
from PIL import Image
from app.scanner.enums import Status
from app.scanner.enums import Status, PageStatus
from app.scanner.tesseract import Tesseract
from app.scanner.processing import correct_image
#from app.backends.common import create_pdf, ocr_pdf
gi.require_version('Libinsane', '1.0')
from gi.repository import Libinsane, GObject # type: ignore
@ -14,10 +18,17 @@ class __LibinsaneSilentLogger(GObject.GObject, Libinsane.Logger):
Libinsane.register_logger(__LibinsaneSilentLogger())
class Page:
    """Bookkeeping record for one scanned page."""

    # File name inside the scanner's storage path; None until it is assigned.
    filename: Optional[str] = None
    # Length in bytes of the raw scan data for this page.
    size_bytes: int
    status: PageStatus
class Scanner:
def __init__(self, storage_path, logger = logging.getLogger()):
    """Set up an initialized scanner with an empty page list.

    storage_path is the directory scanned images are written to; logger is
    used by the scanner and handed on to the Tesseract helper.
    """
    # Plain state first.
    self.storage_path = storage_path
    self.logger = logger
    self.status = Status.INITIALIZED
    self.scanned_pages: List[Page] = []
    # OCR helper shares the scanner's logger.
    self.tesseract = Tesseract(logger)
def __get_device_id(self):
"""
@ -27,6 +38,7 @@ class Scanner:
:returns: Device id of the first scan device
"""
devs = self.api.list_devices(Libinsane.DeviceLocations.LOCAL_ONLY)
self.logger.info("Using device: %s", devs[0].get_dev_id())
return devs[0].get_dev_id()
def __raw_to_img(self, params, img_bytes):
@ -44,15 +56,24 @@ class Scanner:
# Joins the received data chunks and, for RAW_RGB_24 scans, converts them to
# an image, saves it under self.storage_path and records a Page in
# self.scanned_pages. Sets self.status to DONE when last_file is true.
# NOTE(review): this span looks like a rendered diff hunk with removed and
# added lines interleaved (two __raw_to_img calls, a ".png" and a ".jpeg"
# file name, two page.filename assignments). Confirm against the real file
# before changing any statement here.
def __write_file(self, scan_params, data, page_index, last_file):
data = b"".join(data)
if scan_params.get_format() == Libinsane.ImgFormat.RAW_RGB_24:
# Size of the raw scan data, not of the encoded image file.
filesize = len(data)
img = self.__raw_to_img(scan_params, data)
filename = f"out{page_index}.png"
img.save(os.path.join(self.storage_path, filename), format="PNG")
page = Page()
page.filename = filename
page.status = PageStatus.PROCESSING
page.size_bytes = filesize
self.scanned_pages.append(page)
img = self.__raw_to_img(scan_params, data)
filename = f"out{page_index}.jpeg"
# Rotate according to the orientation detected by the Tesseract helper.
img = self.tesseract.rotate_img(img)
img_path = os.path.join(self.storage_path, filename)
img.save(img_path, format="jpeg", quality=95)
#correct_image(img_path)
page.filename = filename
page.status = PageStatus.DONE
# NOTE(review): `page` is the object already appended above, so this
# assignment looks redundant; it also presumes the list position equals
# page_index. Verify, since the scan loop appears to start a thread per page.
self.scanned_pages[page_index] = page
if last_file:
#self.tesseract.create_pdf(scanner=self)
#ocr_pdf()
self.status = Status.DONE
def __set_defaults(self):
@ -61,23 +82,31 @@ class Scanner:
opts = {opt.get_name(): opt for opt in opts}
opts["sleeptimer"].set_value(1)
opts["resolution"].set_value(200)
opts["swcrop"].set_value(True)
opts["swdeskew"].set_value(True)
opts["page-height"].set_value(300)
opts["mode"].set_value("Color")
dev.close()
def __scan(self):
self.logger.info("Scan requested")
self.status = Status.RUNNING
source = self.api.get_device(self.device_id)
opts = source.get_options()
opts = {opt.get_name(): opt for opt in opts}
if opts["cover-open"].get_value() == True:
self.logger.warn("Cover open. Can't scan.")
self.status = Status.ERR_COVER_OPEN
return
self.logger.info("Starting scan...")
session = source.scan_start()
try:
page_index = 0
while not session.end_of_feed() and page_index < 50:
# Do not assume that all the pages will have the same size !
self.logger.info("Processing page %s", page_index)
# Do not assume that all the pages will have the same size
scan_params = session.get_scan_parameters()
img = []
while not session.end_of_page():
@ -88,16 +117,12 @@ class Scanner:
t.start()
page_index += 1
if page_index == 0:
self.logger.warn("No paper. Nothing to scan.")
self.status = Status.ERR_NO_PAPER
finally:
session.cancel()
source.close()
# NOTE(review): second __init__ in this view. It sets only scanned_pages,
# storage_path and status (no logger, no Tesseract helper) and is presumably
# the removed pre-change constructor shown by the diff. Confirm it is absent
# from the real file: a later def in a class body replaces an earlier one.
def __init__(self, storage_path):
self.scanned_pages: List[Page] = []
self.storage_path = storage_path
self.status = Status.INITIALIZED
def preload(self):
os.environ["LIBINSANE_NORMALIZER_SAFE_DEFAULTS"] = "0"
self.api = Libinsane.Api.new_safebet()

View file

@ -0,0 +1,34 @@
import logging
import pyocr
import pyocr.libtesseract
from PIL import Image
class Tesseract:
    """Wrapper around a pyocr tool for orientation correction and PDF output."""

    def __init__(self, logger = logging.getLogger(), lang = "deu"):
        """Select the OCR tool to use.

        Args:
            logger: Logger used for all messages of this helper.
            lang: Tesseract language code used for orientation detection and
                PDF building.

        Raises:
            RuntimeError: If pyocr reports no available OCR tool.
        """
        self.logger = logger
        self.lang = lang
        tools = pyocr.get_available_tools()
        if not tools:
            self.logger.error("No OCR tool found")
            raise RuntimeError("No OCR tool found")
        # Prefer libtesseract, whose PDF builder create_pdf() uses; otherwise
        # fall back to the first available tool instead of indexing blindly.
        self.tool = pyocr.libtesseract if pyocr.libtesseract in tools else tools[0]
        self.logger.info("Will use tool '%s'", self.tool.get_name())

    def rotate_img(self, image: Image.Image) -> Image.Image:
        """Rotate ``image`` by the angle reported by orientation detection.

        If the OCR tool cannot determine an orientation, the image is
        returned unchanged rather than failing the whole page.
        """
        try:
            orientation = self.tool.detect_orientation(
                image,
                lang=self.lang
            )
        except pyocr.error.PyocrException as err:
            self.logger.warning("Tesseract: Could not detect orientation, leaving image unrotated (%s)", err)
            return image
        self.logger.info("Tesseract: Rotate by %s degrees to correct (Confidence: %s)", orientation["angle"], orientation["confidence"])
        return image.rotate(orientation["angle"], expand=True)

    def create_pdf(self, scanner, output_file = "/var/www/html/img/out", image_dir = "/var/www/html/img"):
        """Build a PDF from all pages of ``scanner`` with the libtesseract builder.

        Args:
            scanner: Object exposing ``get_pages()``; each page provides a
                ``filename`` relative to ``image_dir``.
            output_file: Output file passed to the PDF builder.
            image_dir: Directory containing the page images.
        """
        builder = pyocr.libtesseract.LibtesseractPdfBuilder()
        builder.set_lang(self.lang)
        builder.set_output_file(output_file)
        for page in scanner.get_pages():
            # A page without a file name has no image on disk to add.
            if page.filename is None:
                self.logger.warning("Skipping page without file name")
                continue
            filename = f"{image_dir}/{page.filename}"
            self.logger.info(filename)
            img = Image.open(filename)
            builder.add_image(img)
        builder.build()

View file

@ -16,6 +16,7 @@ pycairo==1.24.0
pydantic==1.10.12
pydantic_core==2.6.3
PyGObject==3.44.1
pytesseract==0.3.10
python-dateutil==2.8.2
python-dotenv==1.0.0
PyYAML==6.0.1

View file

@ -1,9 +1,10 @@
<script setup lang="ts">
import { ref } from 'vue';
import LoadingSpinner from '@/components/LoadingSpinner.vue';
import type { ScannedPage as ScannedPageType } from '@/types/scanner'
// `ScannedPageType` is imported with `import type`, so it exists only at the
// type level and cannot serve as a runtime prop constructor. Declare the prop
// as an Object and attach the interface through PropType instead.
const props = defineProps({
  imgUrl: String,
  scannedPage: Object as import('vue').PropType<ScannedPageType>
})
const imgLoaded = ref(false)
@ -12,7 +13,7 @@ const imgLoaded = ref(false)
<div class="p-2">
<div class="w-full h-full rounded-lg shadow-lg bg-white flex justify-center items-center">
<LoadingSpinner v-if="!imgLoaded" class="w-10 h-10 text-gray-600" />
<img v-if="imgUrl" v-show="imgLoaded" :src="imgUrl" @load="imgLoaded=true" class="w-full h-full rounded-lg object-cover">
<img v-if="scannedPage.status === 'done'" v-show="imgLoaded" :src="'/img/' + scannedPage.filename" @load="imgLoaded=true" class="w-full h-full rounded-lg object-cover">
</div>
</div>
</template>

16
frontend/src/types/scanner.d.ts vendored Normal file
View file

@ -0,0 +1,16 @@
/** One scanned page as delivered in the `pages` list of a scan status. */
export interface ScannedPage {
  /** Processing state of the page. */
  status: "processing" | "done";
  /** Size of the page data in bytes. */
  size_bytes: number;
  /** Image file name; optional, presumably unset while still processing. */
  filename?: string;
}
/** Overall scanner state together with the pages scanned so far. */
export interface ScanStatus {
  /** Scanner state; the `err_*` values report why a scan could not run. */
  status: "initialized" | "idle" | "running" | "done" | "err_no_paper" | "err_cover_open";
  /** Pages of the current scan. */
  pages: ScannedPage[];
}

View file

@ -34,7 +34,7 @@ axios.post('/api/scan')
<template>
<dev class="w-full h-full flex flex-col">
<div class="w-full h-full p-2 flex flex-row flex-wrap overflow-auto">
<ScannedPage v-for="page in data.pages" :key="page.filename" class="w-1/5 h-1/2" :imgUrl="'/img/' + page.filename" />
<ScannedPage v-for="page in data.pages" :scannedPage="page" class="w-1/5 h-1/2" />
<ScannedPage v-if="data.status==='running'" class="w-1/5 h-1/2" />
</div>
<div class="w-full h-28 p-4 flex">