Add OCR and image correction

2025-07-28 07:22:32 +02:00 · 2024-07-07 00:30:13 +02:00 · 2024-07-07 00:30:13 +02:00 · e93117aeb4
commit e93117aeb4
parent e74d3a4e27
13 changed files with 228 additions and 20 deletions
--- a/backend/app/backends/__init__.py
+++ b/backend/app/backends/__init__.py
--- a/backend/app/backends/common.py
+++ b/backend/app/backends/common.py
@ -0,0 +1,16 @@
 from PIL import Image
 import ocrmypdf
 def create_pdf(scanner, storage_path="/var/www/html/img", dpi=200):
    """Combine all finished scan pages into one multi-page A4 PDF.

    Every page image is pasted into the top-left corner of a white A4 canvas
    (210 mm x 297 mm at ``dpi``) and the canvases are written together to
    ``<storage_path>/out.pdf``.

    Args:
        scanner: Object exposing ``get_pages()``; each returned page needs a
            ``filename`` attribute naming a file inside ``storage_path``.
        storage_path: Directory that holds the page images and receives
            ``out.pdf``.
        dpi: Resolution used to size the A4 canvas; it is also stored in the
            PDF so the pages keep their physical A4 dimensions.

    Returns:
        None. Pages without a filename (not finished yet) are skipped, and
        when no page is left nothing is written.
    """
    mm_per_inch = 25.4
    a4_size = (int(210 * dpi / mm_per_inch), int(297 * dpi / mm_per_inch))

    images = []
    for page in scanner.get_pages():
        if not page.filename:
            # The filename is optional and only set once a page is finished.
            continue
        with Image.open(f"{storage_path}/{page.filename}") as img:
            a4im = Image.new('RGB', a4_size, (255, 255, 255))
            # Paste at the origin. Passing img.getbbox() as the box fails as
            # soon as the non-zero bounding box is smaller than the image,
            # because a 4-tuple box must match the pasted image's size.
            a4im.paste(img, (0, 0))
        images.append(a4im)

    if not images:
        # Nothing to write; images[0] below would raise IndexError.
        return

    images[0].save(f"{storage_path}/out.pdf", save_all=True,
                   append_images=images[1:], resolution=float(dpi))
 def ocr_pdf():
    """Run ocrmypdf on the combined scan PDF.

    Reads ``out.pdf`` from the fixed image directory and writes the result
    next to it as ``final.pdf``.
    """
    source_pdf = '/var/www/html/img/out.pdf'
    target_pdf = '/var/www/html/img/final.pdf'
    ocrmypdf.ocr(source_pdf, target_pdf)
--- a/backend/app/backends/email.py
+++ b/backend/app/backends/email.py
--- a/backend/app/data/schemas.py
+++ b/backend/app/data/schemas.py
@ -1,9 +1,11 @@
 from pydantic import BaseModel
 from typing import Optional
 import app.scanner.enums as scan
 class ScanPage(BaseModel):
-    filename: str
+    filename: Optional[str]
    size_bytes: int
    status: scan.PageStatus
    class Config():
        orm_mode = True
--- a/backend/app/main.py
+++ b/backend/app/main.py
@ -1,4 +1,4 @@
-import threading
+import threading, logging
 from contextlib import asynccontextmanager
 from typing import Annotated
@ -7,13 +7,25 @@ from fastapi import FastAPI, Depends
 from app.data import models
 from app.data.database import SessionLocal, engine
 from uvicorn.logging import DefaultFormatter
 from app.scanner.scanner import Scanner
 from app.scanner.scanner import Status as ScannerStatus
 # Set up logging
 logger = logging.getLogger()
 __syslog = logging.StreamHandler()
 __syslog.setFormatter(DefaultFormatter(fmt="%(levelprefix)s %(message)s", use_colors=True))
 logger.setLevel(logging.INFO)
 logger.addHandler(__syslog)
 # Create database
 models.Base.metadata.create_all(bind=engine)
-__scanner = Scanner("/var/www/html/img")
+# Set up scanner instance
 __scanner = Scanner("/var/www/html/img", logger)
 # Preload scanner after FastAPI start
@asynccontextmanager
 async def __lifespan(app: FastAPI):
    threading.Thread(target=__scanner.preload).start()
--- a/backend/app/scanner/enums.py
+++ b/backend/app/scanner/enums.py
@ -8,6 +8,10 @@ class Status(Enum):
    ERR_NO_PAPER = "err_no_paper"
    ERR_COVER_OPEN = "err_cover_open"
 class PageStatus(Enum):
    """Processing state of a single scanned page."""
    # Set by the scanner when the page is first added to the page list.
    PROCESSING = "processing"
    # Set by the scanner once the rotated image is saved and the filename set.
    DONE = "done"
 class Setting(Enum):
    PAPER_SOURCE = "source"
    COLOR_MODE = "color"
--- a/backend/app/scanner/processing.py
+++ b/backend/app/scanner/processing.py
@ -0,0 +1,97 @@
 import cv2
 import numpy as np
 def order_points(pts):
    '''Rearrange corner coordinates to the order:
      top-left, top-right, bottom-right, bottom-left

    Args:
        pts: Corner points as a nested list or array of (x, y) pairs.
            Both the flat (N, 2) layout and the OpenCV contour layout
            (N, 1, 2) are accepted.

    Returns:
        A list of four [x, y] integer pairs in the order given above.
    '''
    # OpenCV contours and approxPolyDP results have shape (N, 1, 2). Flatten
    # to (N, 2) so the per-point sum and difference below really are
    # per-point; without this the arg-indices address the wrong rows.
    pts = np.asarray(pts).reshape(-1, 2)
    rect = np.zeros((4, 2), dtype='float32')

    s = pts.sum(axis=1)
    # Top-left point will have the smallest x + y sum.
    rect[0] = pts[np.argmin(s)]
    # Bottom-right point will have the largest x + y sum.
    rect[2] = pts[np.argmax(s)]

    diff = np.diff(pts, axis=1)
    # Top-right point will have the smallest y - x difference.
    rect[1] = pts[np.argmin(diff)]
    # Bottom-left point will have the largest y - x difference.
    rect[3] = pts[np.argmax(diff)]

    # Return the ordered coordinates as plain integer lists.
    return rect.astype('int').tolist()
 def find_dest(pts):
    '''Compute the upright target rectangle for four ordered corners.

    Args:
        pts: Corners in the order top-left, top-right, bottom-right,
            bottom-left.

    Returns:
        The corners of a rectangle anchored at (0, 0) whose width and height
        are the longer of the two opposing edges (truncated to integers),
        passed through order_points.
    '''
    top_left, top_right, bottom_right, bottom_left = pts

    def edge_length(p, q):
        # Euclidean distance between two corner points.
        return np.sqrt(((p[0] - q[0]) ** 2) + ((p[1] - q[1]) ** 2))

    # Width: longer of the bottom and top edges.
    out_width = max(int(edge_length(bottom_right, bottom_left)),
                    int(edge_length(top_right, top_left)))
    # Height: longer of the right and left edges.
    out_height = max(int(edge_length(top_right, bottom_right)),
                     int(edge_length(top_left, bottom_left)))

    target = [[0, 0], [out_width, 0], [out_width, out_height], [0, out_height]]
    return order_points(target)
 def correct_image(img_path):
    '''Detect the page outline in a scan and straighten it in place.

    The image is down-scaled so that its largest dimension is at most
    1080 px, text is removed with a repeated morphological closing, the page
    is separated from the background with GrabCut, and the first
    four-cornered contour among the five largest is warped to an upright
    rectangle that overwrites ``img_path``.

    NOTE(review): the corrected image is written at the down-scaled working
    resolution, so large scans lose resolution - confirm this is intended.

    Args:
        img_path: Path of the image file to read and overwrite.

    Returns:
        The (possibly down-scaled) unmodified image when no usable page
        outline was found; None after the corrected image has been written.

    Raises:
        ValueError: If ``img_path`` cannot be read as an image.
    '''
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise ValueError(f"Could not read image: {img_path}")

    # Resize image to workable size.
    dim_limit = 1080
    max_dim = max(img.shape)
    if max_dim > dim_limit:
        resize_scale = dim_limit / max_dim
        img = cv2.resize(img, None, fx=resize_scale, fy=resize_scale)
    # Keep a copy of the resized original; it is the image that gets warped.
    orig_img = img.copy()

    # Repeated closing operation to remove text from the document.
    kernel = np.ones((5, 5), np.uint8)
    img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel, iterations=3)

    # GrabCut, initialised with a rectangle inset 20 px from the border.
    mask = np.zeros(img.shape[:2], np.uint8)
    bgdModel = np.zeros((1, 65), np.float64)
    fgdModel = np.zeros((1, 65), np.float64)
    rect = (20, 20, img.shape[1] - 20, img.shape[0] - 20)
    cv2.grabCut(img, mask, rect, bgdModel, fgdModel, 5, cv2.GC_INIT_WITH_RECT)
    # Mask values 0 and 2 are zeroed out; every other value is kept.
    mask2 = np.where((mask == 2) | (mask == 0), 0, 1).astype('uint8')
    img = img * mask2[:, :, np.newaxis]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (11, 11), 0)

    # Edge detection.
    canny = cv2.Canny(gray, 0, 200)
    canny = cv2.dilate(canny, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5)))

    # Find contours of the detected edges and keep the five largest.
    contours, hierarchy = cv2.findContours(canny, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
    candidates = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

    # Take the first candidate whose approximation has exactly four points.
    corners = None
    for c in candidates:
        epsilon = 0.02 * cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, epsilon, True)
        if len(approx) == 4:
            corners = approx
            break
    if corners is None:
        # No quadrilateral found (or no contours at all): leave the file
        # untouched instead of warping to an arbitrary polygon.
        return orig_img

    # approxPolyDP yields shape (4, 1, 2); flatten before ordering.
    corners = order_points(corners.reshape(-1, 2))
    destination_corners = find_dest(corners)
    width, height = destination_corners[2]
    if width <= 0 or height <= 0:
        # Degenerate outline; warpPerspective needs a non-empty output size.
        return orig_img

    # Getting the homography. OpenCV requires float32 point arrays here;
    # plain Python lists of ints are rejected.
    M = cv2.getPerspectiveTransform(np.float32(corners),
                                    np.float32(destination_corners),
                                    cv2.DECOMP_LU)
    # Perspective transform using the homography.
    final = cv2.warpPerspective(orig_img, M, (width, height),
                                flags=cv2.INTER_LINEAR)
    cv2.imwrite(img_path, final)
--- a/backend/app/scanner/scanner.py
+++ b/backend/app/scanner/scanner.py
@ -1,8 +1,12 @@
-import gi, os, threading
+import gi, os, threading, logging
-from typing import List
+from typing import List, Optional
 from PIL import Image
-from app.scanner.enums import Status
+from app.scanner.enums import Status, PageStatus
 from app.scanner.tesseract import Tesseract
 from app.scanner.processing import correct_image
 #from app.backends.common import create_pdf, ocr_pdf
 gi.require_version('Libinsane', '1.0')
 from gi.repository import Libinsane, GObject # type: ignore
@ -14,10 +18,17 @@ class __LibinsaneSilentLogger(GObject.GObject, Libinsane.Logger):
 Libinsane.register_logger(__LibinsaneSilentLogger())
 class Page:
-    filename: str
+    filename: Optional[str] = None
    size_bytes: int
    status: PageStatus
 class Scanner:
    def __init__(self, storage_path, logger = logging.getLogger()):
        """Set up an idle scanner that stores its images in storage_path.

        :param storage_path: Directory the scanned page images are saved to
        :param logger: Logger for status messages, also handed to Tesseract
        """
        # Plain configuration first.
        self.storage_path = storage_path
        self.logger = logger
        # Runtime state: nothing scanned yet.
        self.status = Status.INITIALIZED
        self.scanned_pages: List[Page] = []
        # OCR helper shares the scanner's logger.
        self.tesseract = Tesseract(logger)
    def __get_device_id(self):
        """
@ -27,6 +38,7 @@ class Scanner:
        :returns: Device id of the first scan device
        """
        devs = self.api.list_devices(Libinsane.DeviceLocations.LOCAL_ONLY)
        self.logger.info("Using device: %s", devs[0].get_dev_id())
        return devs[0].get_dev_id()
    def __raw_to_img(self, params, img_bytes):
@ -44,15 +56,24 @@ class Scanner:
    def __write_file(self, scan_params, data, page_index, last_file):
        data = b"".join(data)
        if scan_params.get_format() == Libinsane.ImgFormat.RAW_RGB_24:
            filesize = len(data)
            img = self.__raw_to_img(scan_params, data)
            filename = f"out{page_index}.png"
            img.save(os.path.join(self.storage_path, filename), format="PNG")
            page = Page()
-            page.filename = filename
+            page.status = PageStatus.PROCESSING
            page.size_bytes = filesize
            self.scanned_pages.append(page)
            img = self.__raw_to_img(scan_params, data)
            filename = f"out{page_index}.jpeg"
            img = self.tesseract.rotate_img(img)
            img_path = os.path.join(self.storage_path, filename)
            img.save(img_path, format="jpeg", quality=95)
            #correct_image(img_path)
            page.filename = filename
            page.status = PageStatus.DONE
            self.scanned_pages[page_index] = page
        if last_file:
            #self.tesseract.create_pdf(scanner=self)
            #ocr_pdf()
            self.status = Status.DONE
    def __set_defaults(self):
@ -61,23 +82,31 @@ class Scanner:
        opts = {opt.get_name(): opt for opt in opts}
        opts["sleeptimer"].set_value(1)
        opts["resolution"].set_value(200)
        opts["swcrop"].set_value(True)
        opts["swdeskew"].set_value(True)
        opts["page-height"].set_value(300)
        opts["mode"].set_value("Color")
        dev.close()
    def __scan(self):
        self.logger.info("Scan requested")
        self.status = Status.RUNNING
        source = self.api.get_device(self.device_id)
        opts = source.get_options()
        opts = {opt.get_name(): opt for opt in opts}
        if opts["cover-open"].get_value() == True:
            self.logger.warn("Cover open. Can't scan.")
            self.status = Status.ERR_COVER_OPEN
            return
        self.logger.info("Starting scan...")
        session = source.scan_start()
        try:
            page_index = 0
            while not session.end_of_feed() and page_index < 50:
-                # Do not assume that all the pages will have the same size !
+                self.logger.info("Processing page %s", page_index)
                # Do not assume that all the pages will have the same size
                scan_params = session.get_scan_parameters()
                img = []
                while not session.end_of_page():
@ -88,15 +117,11 @@ class Scanner:
                t.start()
                page_index += 1
            if page_index == 0:
                self.logger.warn("No paper. Nothing to scan.")
                self.status = Status.ERR_NO_PAPER
        finally:
            session.cancel()
            source.close()
    def __init__(self, storage_path):
        self.scanned_pages: List[Page] = []
        self.storage_path = storage_path
        self.status = Status.INITIALIZED
    def preload(self):
        os.environ["LIBINSANE_NORMALIZER_SAFE_DEFAULTS"] = "0"
--- a/backend/app/scanner/tesseract.py
+++ b/backend/app/scanner/tesseract.py
@ -0,0 +1,34 @@
 import logging
 import pyocr
 import pyocr.libtesseract
 from PIL import Image
 class Tesseract:
    """Wrapper around a pyocr tool for orientation fixes and PDF output."""

    def __init__(self, logger=logging.getLogger(), lang="deu"):
        """Select the OCR tool to use.

        Args:
            logger: Logger for status messages; defaults to the root logger.
            lang: Tesseract language code used for orientation detection
                and PDF creation.

        Raises:
            RuntimeError: If pyocr reports no available OCR tool.
        """
        self.logger = logger
        self.lang = lang
        tools = pyocr.get_available_tools()
        if not tools:
            self.logger.error("No OCR tool found")
            raise RuntimeError("No OCR tool found")
        # Prefer libtesseract explicitly instead of relying on it sitting at
        # index 1, which raises IndexError when fewer than two tools are
        # installed. Fall back to the first available tool otherwise.
        if pyocr.libtesseract in tools:
            self.tool = pyocr.libtesseract
        else:
            self.tool = tools[0]
        self.logger.info("Will use tool '%s'", self.tool.get_name())

    def rotate_img(self, image: Image.Image) -> Image.Image:
        """Return image rotated by the angle the OCR tool detected.

        NOTE(review): detect_orientation may raise for pages without enough
        text (e.g. blank pages) - confirm and decide whether such pages
        should be passed through unrotated.
        """
        orientation = self.tool.detect_orientation(
            image,
            lang=self.lang
        )
        self.logger.info("Tesseract: Rotate by %s degrees to correct (Confidence: %s)", orientation["angle"], orientation["confidence"])
        return image.rotate(orientation["angle"], expand=True)

    def create_pdf(self, scanner):
        """Build a PDF from all finished pages of scanner.

        Pages come from ``scanner.get_pages()``; pages without a filename
        are skipped. The output file base is ``/var/www/html/img/out``.
        """
        builder = pyocr.libtesseract.LibtesseractPdfBuilder()
        builder.set_lang(self.lang)
        builder.set_output_file("/var/www/html/img/out")
        opened = []
        try:
            for page in scanner.get_pages():
                if not page.filename:
                    # The filename is only set once a page is finished.
                    continue
                filename = f"/var/www/html/img/{page.filename}"
                self.logger.info(filename)
                img = Image.open(filename)
                opened.append(img)
                builder.add_image(img)
            builder.build()
        finally:
            # Keep the images open until the PDF is built, then release them.
            for img in opened:
                img.close()
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@ -16,6 +16,7 @@ pycairo==1.24.0
 pydantic==1.10.12
 pydantic_core==2.6.3
 PyGObject==3.44.1
 pytesseract==0.3.10
 python-dateutil==2.8.2
 python-dotenv==1.0.0
 PyYAML==6.0.1
--- a/frontend/src/components/ScannedPage.vue
+++ b/frontend/src/components/ScannedPage.vue
@ -1,9 +1,10 @@
 <script setup lang="ts">
 import { ref } from 'vue';
 import LoadingSpinner from '@/components/LoadingSpinner.vue';
 import type { ScannedPage as ScannedPageType } from '@/types/scanner'
 const props = defineProps({
-    imgUrl: String
+    scannedPage: ScannedPageType
 })
 const imgLoaded = ref(false)
@ -12,7 +13,7 @@ const imgLoaded = ref(false)
    <div class="p-2">
        <div class="w-full h-full rounded-lg shadow-lg bg-white flex justify-center items-center">
            <LoadingSpinner v-if="!imgLoaded" class="w-10 h-10 text-gray-600" />
-            <img v-if="imgUrl" v-show="imgLoaded" :src="imgUrl" @load="imgLoaded=true" class="w-full h-full rounded-lg object-cover">
+            <img v-if="scannedPage.status === 'done'" v-show="imgLoaded" :src="'/img/' + scannedPage.filename" @load="imgLoaded=true" class="w-full h-full rounded-lg object-cover">
        </div>
    </div>
 </template>
--- a/frontend/src/types/scanner.d.ts
+++ b/frontend/src/types/scanner.d.ts
@ -0,0 +1,16 @@
 /** One scanned page as delivered by the backend scan status. */
 export interface ScannedPage {
    /** Image file name; the page component loads it from `/img/<filename>`. Optional - presumably unset until the page is done. */
    filename?: string;
 	/** Size of the page in bytes, as reported by the backend. */
 	size_bytes: number;
 	/** Processing state; the page image is only rendered once this is "done". */
 	status: "processing" | "done";
 }
 /** Overall scan state together with the pages scanned so far. */
 export interface ScanStatus {
    /** Scanned pages. */
    pages: Array<ScannedPage>;
    /** Scanner state; the values mirror the backend `Status` enum strings (e.g. "err_no_paper", "err_cover_open"). */
    status:
      | "initialized"
      | "idle"
      | "running"
      | "done"
      | "err_no_paper"
      | "err_cover_open";
 }
--- a/frontend/src/views/ScanView.vue
+++ b/frontend/src/views/ScanView.vue
@ -34,7 +34,7 @@ axios.post('/api/scan')
 <template>
    <dev class="w-full h-full flex flex-col">
        <div class="w-full h-full p-2 flex flex-row flex-wrap overflow-auto">
-            <ScannedPage v-for="page in data.pages" :key="page.filename" class="w-1/5 h-1/2" :imgUrl="'/img/' + page.filename" />
+            <ScannedPage v-for="page in data.pages" :scannedPage="page" class="w-1/5 h-1/2" />
            <ScannedPage v-if="data.status==='running'" class="w-1/5 h-1/2" />
        </div>
        <div class="w-full h-28 p-4 flex">