Add OCR and image correction

2025-07-28 07:22:32 +02:00 · 2024-07-07 00:30:13 +02:00 · 2024-07-07 00:30:13 +02:00 · e93117aeb4
commit e93117aeb4
parent e74d3a4e27
13 changed files with 228 additions and 20 deletions
--- a/backend/app/backends/init.py
+++ b/backend/app/backends/init.py
--- a/backend/app/backends/common.py
+++ b/backend/app/backends/common.py
@ -0,0 +1,16 @@
+from PIL import Image
+import ocrmypdf
+
+def create_pdf(scanner):
+    images = []
+    for page in scanner.get_pages():
+        img = Image.open(f"/var/www/html/img/{page.filename}")
+        a4im = Image.new('RGB',
+                 (int(210 * 200 / 25.4), int(297 * 200 / 25.4)),
+                 (255, 255, 255))
+        a4im.paste(img, img.getbbox())
+        images.append(a4im)
+    images[0].save("/var/www/html/img/out.pdf", save_all=True, append_images=images[1:])
+
+def ocr_pdf():
+    ocrmypdf.ocr('/var/www/html/img/out.pdf', '/var/www/html/img/final.pdf')
--- a/backend/app/backends/email.py
+++ b/backend/app/backends/email.py
--- a/backend/app/data/schemas.py
+++ b/backend/app/data/schemas.py
@ -1,9 +1,11 @@
 from pydantic import BaseModel
+from typing import Optional
 import app.scanner.enums as scan

 class ScanPage(BaseModel):
-    filename: str
+    filename: Optional[str]
    size_bytes: int
+    status: scan.PageStatus

    class Config():
        orm_mode = True
--- a/backend/app/main.py
+++ b/backend/app/main.py
@ -1,4 +1,4 @@
-import threading
+import threading, logging
 from contextlib import asynccontextmanager
 from typing import Annotated

@ -7,13 +7,25 @@ from fastapi import FastAPI, Depends
 from app.data import models
 from app.data.database import SessionLocal, engine

+from uvicorn.logging import DefaultFormatter
+
 from app.scanner.scanner import Scanner
 from app.scanner.scanner import Status as ScannerStatus

+# Set up logging
+logger = logging.getLogger()
+__syslog = logging.StreamHandler()
+__syslog.setFormatter(DefaultFormatter(fmt="%(levelprefix)s %(message)s", use_colors=True))
+logger.setLevel(logging.INFO)
+logger.addHandler(__syslog)
+
+# Create database
 models.Base.metadata.create_all(bind=engine)

-__scanner = Scanner("/var/www/html/img")
+# Set up scanner instance
+__scanner = Scanner("/var/www/html/img", logger)

+# Preload scanner after FastAPI start
@asynccontextmanager
 async def __lifespan(app: FastAPI):
    threading.Thread(target=__scanner.preload).start()
--- a/backend/app/scanner/enums.py
+++ b/backend/app/scanner/enums.py
@ -8,6 +8,10 @@ class Status(Enum):
    ERR_NO_PAPER = "err_no_paper"
    ERR_COVER_OPEN = "err_cover_open"

+class PageStatus(Enum):
+    """Processing state of a single scanned page."""
+    # The page has been registered but its image file is not written yet.
+    PROCESSING = "processing"
+    # The image file has been saved and the page's filename is set.
+    DONE = "done"
+
 class Setting(Enum):
    PAPER_SOURCE = "source"
    COLOR_MODE = "color"
--- a/backend/app/scanner/processing.py
+++ b/backend/app/scanner/processing.py
@ -0,0 +1,97 @@
+import cv2
+import numpy as np
+
+def order_points(pts):
+    '''Rearrange coordinates to order:
+      top-left, top-right, bottom-right, bottom-left'''
+    rect = np.zeros((4, 2), dtype='float32')
+    pts = np.array(pts)
+    s = pts.sum(axis=1)
+    # Top-left point will have the smallest sum.
+    rect[0] = pts[np.argmin(s)]
+    # Bottom-right point will have the largest sum.
+    rect[2] = pts[np.argmax(s)]
+ 
+    diff = np.diff(pts, axis=1)
+    # Top-right point will have the smallest difference.
+    rect[1] = pts[np.argmin(diff)]
+    # Bottom-left will have the largest difference.
+    rect[3] = pts[np.argmax(diff)]
+    # return the ordered coordinates
+    return rect.astype('int').tolist()
+
+def find_dest(pts):
+    (tl, tr, br, bl) = pts
+    # Finding the maximum width.
+    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
+    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
+    maxWidth = max(int(widthA), int(widthB))
+ 
+    # Finding the maximum height.
+    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
+    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
+    maxHeight = max(int(heightA), int(heightB))
+    # Final destination co-ordinates.
+    destination_corners = [[0, 0], [maxWidth, 0], [maxWidth, maxHeight], [0, maxHeight]]
+ 
+    return order_points(destination_corners)
+
+def correct_image(img_path):
+    img = cv2.imread(img_path)
+    # Resize image to workable size
+    dim_limit = 1080
+    max_dim = max(img.shape)
+    if max_dim > dim_limit:
+        resize_scale = dim_limit / max_dim
+        img = cv2.resize(img, None, fx=resize_scale, fy=resize_scale)
+    # Create a copy of resized original image for later use
+    orig_img = img.copy()
+    # Repeated Closing operation to remove text from the document.
+    kernel = np.ones((5, 5), np.uint8)
+    img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel, iterations=3)
+    # GrabCut
+    mask = np.zeros(img.shape[:2], np.uint8)
+    bgdModel = np.zeros((1, 65), np.float64)
+    fgdModel = np.zeros((1, 65), np.float64)
+    rect = (20, 20, img.shape[1] - 20, img.shape[0] - 20)
+    cv2.grabCut(img, mask, rect, bgdModel, fgdModel, 5, cv2.GC_INIT_WITH_RECT)
+    mask2 = np.where((mask == 2) | (mask == 0), 0, 1).astype('uint8')
+    img = img * mask2[:, :, np.newaxis]
+ 
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    gray = cv2.GaussianBlur(gray, (11, 11), 0)
+    # Edge Detection.
+    canny = cv2.Canny(gray, 0, 200)
+    canny = cv2.dilate(canny, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5)))
+ 
+    # Finding contours for the detected edges.
+    contours, hierarchy = cv2.findContours(canny, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
+    # Keeping only the largest detected contour.
+    page = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
+ 
+    # Detecting Edges through Contour approximation.
+    # Loop over the contours.
+    corners = None
+    if len(page) == 0:
+        return orig_img
+    for c in page:
+        # Approximate the contour.
+        epsilon = 0.02 * cv2.arcLength(c, True)
+        corners = cv2.approxPolyDP(c, epsilon, True)
+        # If our approximated contour has four points.
+        if len(corners) == 4:
+            break
+    # Sorting the corners and converting them to desired shape.
+    #corners = sorted(corners)
+    # For 4 corner points being detected.
+    corners = order_points(corners)
+ 
+    destination_corners = find_dest(corners)
+ 
+    h, w = orig_img.shape[:2]
+    # Getting the homography.
+    M = cv2.getPerspectiveTransform(corners, destination_corners, cv2.DECOMP_LU)
+    # Perspective transform using homography.
+    final = cv2.warpPerspective(orig_img, M, (destination_corners[2][0], destination_corners[2][1]),
+                                flags=cv2.INTER_LINEAR)
+    cv2.imwrite(img_path, final)
--- a/backend/app/scanner/scanner.py
+++ b/backend/app/scanner/scanner.py
@ -1,8 +1,12 @@
-import gi, os, threading
-from typing import List
+import gi, os, threading, logging
+from typing import List, Optional

 from PIL import Image
-from app.scanner.enums import Status
+from app.scanner.enums import Status, PageStatus
+from app.scanner.tesseract import Tesseract
+from app.scanner.processing import correct_image
+
+#from app.backends.common import create_pdf, ocr_pdf

 gi.require_version('Libinsane', '1.0')
 from gi.repository import Libinsane, GObject # type: ignore
@ -14,10 +18,17 @@ class __LibinsaneSilentLogger(GObject.GObject, Libinsane.Logger):
 Libinsane.register_logger(__LibinsaneSilentLogger())

 class Page:
-    filename: str
+    filename: Optional[str] = None
    size_bytes: int
+    status: PageStatus

 class Scanner:
+    def __init__(self, storage_path, logger = logging.getLogger()):
+        self.scanned_pages: List[Page] = []
+        self.logger = logger
+        self.tesseract = Tesseract(logger)
+        self.storage_path = storage_path
+        self.status = Status.INITIALIZED

    def __get_device_id(self):
        """
@ -27,6 +38,7 @@ class Scanner:
        :returns: Device id of the first scan device
        """
        devs = self.api.list_devices(Libinsane.DeviceLocations.LOCAL_ONLY)
+        self.logger.info("Using device: %s", devs[0].get_dev_id())
        return devs[0].get_dev_id()

    def __raw_to_img(self, params, img_bytes):
@ -44,15 +56,24 @@ class Scanner:
    def __write_file(self, scan_params, data, page_index, last_file):
        data = b"".join(data)
        if scan_params.get_format() == Libinsane.ImgFormat.RAW_RGB_24:
+
            filesize = len(data)
-            img = self.__raw_to_img(scan_params, data)
-            filename = f"out{page_index}.png"
-            img.save(os.path.join(self.storage_path, filename), format="PNG")
            page = Page()
-            page.filename = filename
+            page.status = PageStatus.PROCESSING
            page.size_bytes = filesize
            self.scanned_pages.append(page)
+            img = self.__raw_to_img(scan_params, data)
+            filename = f"out{page_index}.jpeg"
+            img = self.tesseract.rotate_img(img)
+            img_path = os.path.join(self.storage_path, filename)
+            img.save(img_path, format="jpeg", quality=95)
+            #correct_image(img_path)
+            page.filename = filename
+            page.status = PageStatus.DONE
+            self.scanned_pages[page_index] = page
        if last_file:
+            #self.tesseract.create_pdf(scanner=self)
+            #ocr_pdf()
            self.status = Status.DONE

    def __set_defaults(self):
@ -61,23 +82,31 @@ class Scanner:
        opts = {opt.get_name(): opt for opt in opts}
        opts["sleeptimer"].set_value(1)
        opts["resolution"].set_value(200)
+        opts["swcrop"].set_value(True)
+        opts["swdeskew"].set_value(True)
+        opts["page-height"].set_value(300)
+        opts["mode"].set_value("Color")
        dev.close()

    def __scan(self):
+        self.logger.info("Scan requested")
        self.status = Status.RUNNING
        source = self.api.get_device(self.device_id)

        opts = source.get_options()
        opts = {opt.get_name(): opt for opt in opts}
        if opts["cover-open"].get_value() == True:
+            self.logger.warn("Cover open. Can't scan.")
            self.status = Status.ERR_COVER_OPEN
            return
        
+        self.logger.info("Starting scan...")
        session = source.scan_start()
        try:
            page_index = 0
            while not session.end_of_feed() and page_index < 50:
-                # Do not assume that all the pages will have the same size !
+                self.logger.info("Processing page %s", page_index)
+                # Do not assume that all the pages will have the same size
                scan_params = session.get_scan_parameters()
                img = []
                while not session.end_of_page():
@ -88,15 +117,11 @@ class Scanner:
                t.start()
                page_index += 1
            if page_index == 0:
+                self.logger.warn("No paper. Nothing to scan.")
                self.status = Status.ERR_NO_PAPER
        finally:
            session.cancel()
            source.close()
-    
-    def __init__(self, storage_path):
-        self.scanned_pages: List[Page] = []
-        self.storage_path = storage_path
-        self.status = Status.INITIALIZED

    def preload(self):
        os.environ["LIBINSANE_NORMALIZER_SAFE_DEFAULTS"] = "0"
--- a/backend/app/scanner/tesseract.py
+++ b/backend/app/scanner/tesseract.py
@ -0,0 +1,34 @@
+import logging
+
+import pyocr
+import pyocr.libtesseract
+
+from PIL import Image
+
+class Tesseract:
+    def __init__(self, logger = logging.getLogger()):
+        self.logger = logger
+        tools = pyocr.get_available_tools()
+        if len(tools) == 0:
+            logging.error("No OCR tool found")
+        self.tool = tools[1]
+        logging.info("Will use tool '%s'" % (self.tool.get_name()))
+    
+    def rotate_img(self, image: Image.Image) -> Image.Image:
+        orientation = self.tool.detect_orientation(
+            image,
+            lang='deu'
+        )
+        logging.info("Tesseract: Rotate by %s degrees to correct (Confidence: %s)", orientation["angle"], orientation["confidence"])
+        return image.rotate(orientation["angle"], expand=True)
+    
+    def create_pdf(self, scanner):
+        builder = pyocr.libtesseract.LibtesseractPdfBuilder()
+        builder.set_lang("deu")
+        builder.set_output_file("/var/www/html/img/out")
+        for page in scanner.get_pages():
+            filename = f"/var/www/html/img/{page.filename}"
+            self.logger.info(filename)
+            img = Image.open(filename)
+            builder.add_image(img)
+        builder.build()
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@ -16,6 +16,7 @@ pycairo==1.24.0
 pydantic==1.10.12
 pydantic_core==2.6.3
 PyGObject==3.44.1
+pytesseract==0.3.10
 python-dateutil==2.8.2
 python-dotenv==1.0.0
 PyYAML==6.0.1
--- a/frontend/src/components/ScannedPage.vue
+++ b/frontend/src/components/ScannedPage.vue
@ -1,9 +1,10 @@
 <script setup lang="ts">
 import { ref } from 'vue';
 import LoadingSpinner from '@/components/LoadingSpinner.vue';
+import type { ScannedPage as ScannedPageType } from '@/types/scanner'

 const props = defineProps({
-    imgUrl: String
+    scannedPage: ScannedPageType
 })

 const imgLoaded = ref(false)
@ -12,7 +13,7 @@ const imgLoaded = ref(false)
    <div class="p-2">
        <div class="w-full h-full rounded-lg shadow-lg bg-white flex justify-center items-center">
            <LoadingSpinner v-if="!imgLoaded" class="w-10 h-10 text-gray-600" />
-            <img v-if="imgUrl" v-show="imgLoaded" :src="imgUrl" @load="imgLoaded=true" class="w-full h-full rounded-lg object-cover">
+            <img v-if="scannedPage.status === 'done'" v-show="imgLoaded" :src="'/img/' + scannedPage.filename" @load="imgLoaded=true" class="w-full h-full rounded-lg object-cover">
        </div>
    </div>
 </template>
--- a/frontend/src/types/scanner.d.ts
+++ b/frontend/src/types/scanner.d.ts
@ -0,0 +1,16 @@
+/** A single scanned page as reported by the backend. */
+export interface ScannedPage {
+    /** Image file name; optional, so it may be absent. */
+    filename?: string;
+	/** Size of the page data in bytes. */
+	size_bytes: number;
+	/** Processing state of the page. */
+	status: "processing" | "done";
+}
+
+/** Overall scan state together with the pages scanned so far. */
+export interface ScanStatus {
+    /** Pages of the scan. */
+    pages: Array<ScannedPage>;
+    /** State of the scanner; the `err_*` values denote error states. */
+    status:
+      | "initialized"
+      | "idle"
+      | "running"
+      | "done"
+      | "err_no_paper"
+      | "err_cover_open";
+}
--- a/frontend/src/views/ScanView.vue
+++ b/frontend/src/views/ScanView.vue
@ -34,7 +34,7 @@ axios.post('/api/scan')
 <template>
    <dev class="w-full h-full flex flex-col">
        <div class="w-full h-full p-2 flex flex-row flex-wrap overflow-auto">
-            <ScannedPage v-for="page in data.pages" :key="page.filename" class="w-1/5 h-1/2" :imgUrl="'/img/' + page.filename" />
+            <ScannedPage v-for="page in data.pages" :scannedPage="page" class="w-1/5 h-1/2" />
            <ScannedPage v-if="data.status==='running'" class="w-1/5 h-1/2" />
        </div>
        <div class="w-full h-28 p-4 flex">