major: integrate ocr to pdf reader

This commit is contained in:
Lunaresk 2024-08-25 22:19:53 +02:00
parent 0f6f76a66b
commit 5a453a140e
48 changed files with 250 additions and 22 deletions

View File

@ -4,6 +4,7 @@ from dotenv import load_dotenv
# Absolute path of the directory that contains this file.
basedir = os.path.abspath(os.path.dirname(__file__))
# Load environment variables from the `.env` file next to this file.
load_dotenv(os.path.join(basedir, '.env'))
# Point TESSDATA_PREFIX at the `tessdata` directory next to this file
# (presumably where the OCR engine looks up its data files -- TODO confirm
# against the Tesseract/PyMuPDF setup).
os.environ["TESSDATA_PREFIX"] = os.path.join(basedir, 'tessdata')
class Config(object):
SECRET_KEY = os.environ.get('SECRET_KEY') or "s0m37h!n6-obfu5c471ng"
SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL', '').replace(

Binary file not shown.

View File

@ -0,0 +1 @@
tessedit_create_alto 1

View File

@ -0,0 +1,7 @@
tessedit_ambigs_training 1
load_freq_dawg 0
load_punc_dawg 0
load_system_dawg 0
load_number_dawg 0
ambigs_debug_level 3
load_fixed_length_dawgs 0

View File

@ -0,0 +1 @@
tessedit_zero_rejection T

View File

@ -0,0 +1,5 @@
load_bigram_dawg True
tessedit_enable_bigram_correction True
tessedit_bigram_debug 3
save_raw_choices True
save_alt_choices True

View File

@ -0,0 +1,12 @@
disable_character_fragments T
file_type .bl
textord_fast_pitch_test T
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_resegment_from_boxes T
tessedit_train_from_boxes T
textord_no_rejects T

View File

@ -0,0 +1,13 @@
file_type .bl
#tessedit_use_nn F
textord_fast_pitch_test T
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_resegment_from_boxes T
tessedit_train_from_boxes T
#textord_repeat_extraction F
textord_no_rejects T

View File

@ -0,0 +1 @@
tessedit_char_whitelist 0123456789-.

View File

@ -0,0 +1 @@
tessedit_write_images T

View File

@ -0,0 +1,2 @@
tessedit_create_hocr 1
hocr_font_info 0

View File

@ -0,0 +1,2 @@
interactive_display_mode T
tessedit_display_outwords T

View File

@ -0,0 +1,4 @@
textord_skewsmooth_offset 8
textord_skewsmooth_offset2 8
textord_merge_desc 0.5
textord_no_rejects 1

View File

@ -0,0 +1,2 @@
tessedit_resegment_from_line_boxes 1
tessedit_make_boxes_from_boxes 1

View File

@ -0,0 +1 @@
debug_file tesseract.log

View File

@ -0,0 +1,11 @@
file_type .bl
textord_fast_pitch_test T
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_train_line_recognizer T
textord_no_rejects T
tessedit_init_config_only T

View File

@ -0,0 +1 @@
tessedit_create_lstmbox 1

View File

@ -0,0 +1,4 @@
stopper_debug_level 1
classify_debug_level 1
segsearch_debug_level 1
language_model_debug_level 3

View File

@ -0,0 +1 @@
tessedit_create_boxfile 1

View File

@ -0,0 +1 @@
tessedit_create_pdf 1

View File

@ -0,0 +1 @@
debug_file /dev/null

View File

@ -0,0 +1,2 @@
tessedit_resegment_from_boxes 1
tessedit_make_boxes_from_boxes 1

View File

@ -0,0 +1,12 @@
textord_show_blobs 0
textord_debug_tabfind 3
textord_tabfind_show_partitions 1
textord_tabfind_show_initial_partitions 1
textord_tabfind_show_columns 1
textord_tabfind_show_blocks 1
textord_tabfind_show_initialtabs 1
textord_tabfind_show_finaltabs 1
textord_tabfind_show_strokewidths 1
textord_tabfind_show_vlines 0
textord_tabfind_show_images 1
tessedit_dump_pageseg_images 0

View File

@ -0,0 +1 @@
tessedit_create_tsv 1

View File

@ -0,0 +1,3 @@
# This config file should be used with other config files which create renderers.
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
tessedit_create_txt 1

View File

@ -0,0 +1,2 @@
tessedit_write_unlv 1
unlv_tilde_crunching T

View File

@ -0,0 +1 @@
tessedit_create_wordstrbox 1

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,2 @@
1-\d\d\d-GOOG-411
www.\n\\\*.com

View File

@ -0,0 +1,5 @@
the
quick
brown
fox
jumped

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
# No content needed as all defaults are correct.

View File

@ -0,0 +1,2 @@
chop_enable 0
wordrec_enable_assoc 0

View File

@ -0,0 +1,7 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
classify_enable_adaptive_debugger 1
matcher_debug_flags 6
matcher_debug_level 1

View File

@ -0,0 +1,12 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
classify_enable_adaptive_debugger 1
matcher_debug_flags 6
matcher_debug_level 1
wordrec_display_splits 0
wordrec_display_all_blobs 1
wordrec_display_segmentations 2
classify_debug_level 1

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,9 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
wordrec_display_splits 0
wordrec_display_all_blobs 1
wordrec_display_segmentations 2
classify_debug_level 1
stopper_debug_level 1

View File

@ -0,0 +1,38 @@
"""raise bonid digits
Revision ID: 926395732c3e
Revises: 2a64d3b9235a
Create Date: 2024-08-24 10:33:39.109944
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
# `revision` matches the "Revision ID" in the module docstring.
revision = '926395732c3e'
# `down_revision` matches the "Revises" entry in the module docstring.
down_revision = '2a64d3b9235a'
# No branch labels and no dependencies are declared for this revision.
branch_labels = None
depends_on = None
def upgrade():
    """Widen ``receipt.bonid`` from NUMERIC(24, 0) to Numeric(28, 0)."""
    previous_type = sa.NUMERIC(precision=24, scale=0)
    widened_type = sa.Numeric(precision=28, scale=0)
    # The change is applied through Alembic's batch context for the table.
    with op.batch_alter_table('receipt', schema=None) as batch_op:
        batch_op.alter_column(
            'bonid',
            existing_type=previous_type,
            type_=widened_type,
            existing_nullable=True,
        )
def downgrade():
    """Narrow ``receipt.bonid`` back from Numeric(28, 0) to NUMERIC(24, 0)."""
    widened_type = sa.Numeric(precision=28, scale=0)
    previous_type = sa.NUMERIC(precision=24, scale=0)
    # Mirror image of upgrade(): same column, types swapped.
    with op.batch_alter_table('receipt', schema=None) as batch_op:
        batch_op.alter_column(
            'bonid',
            existing_type=widened_type,
            type_=previous_type,
            existing_nullable=True,
        )

View File

@ -4,7 +4,7 @@ from src import db
class Receipt(db.Model):
id = db.Column(db.BigInteger, primary_key=True, autoincrement=True)
date = db.Column(db.Date, nullable=False)
bonid = db.Column(db.Numeric(precision=24, scale=0), unique=True)
bonid = db.Column(db.Numeric(precision=28, scale=0), unique=True)
from_user = db.Column(db.ForeignKey("login_token.token"),
server_onupdate=db.FetchedValue())
registered = db.Column(db.Boolean, nullable=False,

View File

@ -40,7 +40,7 @@ def upload_receipt(establishment: int):
db.session.commit()
if pdfReceipt:
rename(f"{PDFDir}/temp.pdf", f"{PDFDir}{secure_filename(f'{dbReceipt.id}.pdf')}")
LOGGER.debug(receipt.text)
LOGGER.debug(receipt.words)
return redirect(url_for("receipts.check_items.confirm_receipt_items", receipt_id = dbReceipt.id))
else:
LOGGER.debug(form.errors)

View File

@ -0,0 +1,20 @@
from datetime import datetime
def getDictFromWords(words: list[tuple]) -> dict:
    """Extract date, receipt number and items from the words of an Edeka receipt.

    Every entry of *words* is read by index: ``[4]`` is the word text, ``[5]``
    the group key that opens a new item whenever it changes, ``[6]`` the line
    inside that group and ``[7]`` the word position on that line (presumably
    PyMuPDF ``get_text("words")`` tuples, i.e. block/line/word numbers --
    confirm against the caller).

    Args:
        words: Word tuples of the whole receipt in reading order.

    Returns:
        A dict with the keys ``"date"`` (``datetime.date`` parsed from the
        sixth-last word as ``%d.%m.%y``), ``"bonid"`` (text of the last word)
        and ``"items"`` (list of dicts with ``"itemname"`` and, when found,
        ``"price"``).

    Raises:
        IndexError: If *words* holds fewer than six entries.
        ValueError: If the sixth-last word is not a ``%d.%m.%y`` date.
    """
    results = {"items": []}
    results['date'] = datetime.strptime(words[-6][4], "%d.%m.%y").date()
    results['bonid'] = words[-1][4]
    items = results['items']
    currentline = 0
    # The first 14 words are skipped before item parsing starts.
    skipwords = 14
    for word in words[skipwords:]:
        text = word[4]
        if currentline != word[5]:
            # A new group key opens a new item.
            items.append({"itemname": text})
            currentline = word[5]
        elif word[6] == 0 and items:
            # Further words on the first line extend the item name.  Words
            # seen before any item was opened are ignored instead of raising.
            items[-1]["itemname"] += " " + text
        if word[6] == 1 and word[7] == 1 and items:
            # Second word of the second line carries the price; anything
            # after a "*" is cut off.
            items[-1]["price"] = text.split("*")[0]
        if "----" in text:
            # The dashed separator ends the item list; drop the entry that
            # was opened last (if there is one) and stop.
            if items:
                del items[-1]
            break
    return results

View File

@ -0,0 +1,27 @@
from datetime import datetime
def getDictFromWords(words: list[tuple]) -> dict:
    """Extract receipt number, items and date from the words of a Kaufland receipt.

    Every entry of *words* is read by index: ``[4]`` is the word text, ``[5]``
    the group key that opens a new item whenever it changes, ``[6]`` the line
    inside that group and ``[7]`` the word position on that line (presumably
    PyMuPDF ``get_text("words")`` tuples, i.e. block/line/word numbers --
    confirm against the caller).

    Args:
        words: Word tuples of the whole receipt in reading order.

    Returns:
        A dict with ``"bonid"`` (text of the last word), ``"items"`` (list of
        dicts with ``"itemname"`` and, when found, ``"price"``/``"amount"``)
        and -- only if a ``Datum:`` word followed by a date exists --
        ``"date"`` as ``datetime.date``.

    Raises:
        IndexError: If *words* is empty.
        ValueError: If the word after ``Datum:`` is not a ``%d.%m.%Y`` date.
    """
    results = {"items": []}
    results['bonid'] = words[-1][4]
    items = results['items']
    currentline = 0
    # The first 9 words are skipped before item parsing starts.
    skipwords = 9
    # `i` is the absolute index into `words`, so look-ahead needs no offset.
    for i, word in enumerate(words[skipwords:], start=skipwords):
        text = word[4]
        if currentline != word[5]:
            # A new group key opens a new item.
            items.append({"itemname": text})
            currentline = word[5]
        elif word[6] == 0 and items:
            items[-1]["itemname"] += " " + text
        if items:
            # Words seen before any item was opened are ignored instead of
            # raising IndexError on the empty list.
            if word[6] == 1 and word[7] == 0:
                if text.lower() == "x":
                    # The word after "x" is stored as the amount; skip it if
                    # "x" happens to be the very last word.
                    if i + 1 < len(words):
                        items[-1]["amount"] = words[i + 1][4]
                else:
                    items[-1]["price"] = text
            elif word[6] == 2:
                items[-1]["price"] = text
        if text.lower() == "gesamt":
            # "Gesamt" ends the item list; drop the entry opened last.
            if items:
                del items[-1]
            break
    # Walk backwards so the earliest "Datum:" in reading order wins, as
    # before.  The range stops one short of the end, so a trailing "Datum:"
    # with no following word is ignored rather than wrapping around to the
    # first word of the receipt.
    for idx in reversed(range(len(words) - 1)):
        if words[idx][4].lower() == "datum:":
            results['date'] = datetime.strptime(words[idx + 1][4], "%d.%m.%Y").date()
    return results

View File

@ -1,5 +1,7 @@
import fitz
from datetime import datetime, date
from .edeka.edeka_parser import getDictFromWords as edekaparser
from .kaufland.kaufland_parser import getDictFromWords as kauflandparser
from re import search
class PDFReceipt:
@ -10,22 +12,29 @@ class PDFReceipt:
parser -- A keyword in lowercase to tell how the receipt is formated.
Currently supported: 'edeka'
"""
def __init__(self, strPDFFile) -> None:
    """Read the receipt PDF and fill ``words``, ``id``, ``date`` and ``items``.

    The words are extracted from the PDF, the store name is detected from
    them, and the store-specific parser result is unpacked into ``id``,
    ``date`` and ``items``.  If any of these steps fails, the instance falls
    back to a placeholder: ``words`` becomes an error message string,
    ``date`` today's date, ``id`` ``None`` and ``items`` an empty list.
    """
    try:
        self.words = PDFReceipt._getWordsFromPDF(strPDFFile)
        storename = PDFReceipt._getStoreName(self.words)
        self.id, self.date, self.items = PDFReceipt._getInfosFromText(self.words, store=storename)
    except Exception:
        # Deliberate best-effort fallback.  `Exception` (not a bare except)
        # so KeyboardInterrupt/SystemExit still propagate.
        self.words = "PDF konnte nicht geladen werden."
        self.date = date.today()
        self.id = None
        self.items = []
def _getWordsFromPDF(file):
    """Return the word entries of every page of the PDF *file*, using OCR text pages."""
    collected = []
    with fitz.open(file, filetype="pdf") as doc:
        for page in doc:
            # Each page is run through OCR first; the words are then read
            # from that OCR text page in sorted order.
            ocr_page = page.get_textpage_ocr()
            collected += page.get_text("words", textpage=ocr_page, sort=True)
    return collected
def _getStoreName(words: list[tuple], known_stores: tuple[str, ...] = ("edeka", "kaufland")) -> str:
    """Return the first known store name found among *words*.

    Args:
        words: Word tuples whose element ``[4]`` is the word text.
        known_stores: Lower-case store names to look for; defaults to the
            two stores that were previously hard-coded here.

    Returns:
        The lower-cased matching word, or ``"unknown"`` if no word matches.
    """
    for word in words:
        candidate = word[4].lower()
        if candidate in known_stores:
            return candidate
    return "unknown"
def _getItemsTextFromText(text, start="", end=""):
    """Return the stripped text between the *start* and *end* markers.

    The slice begins right after the first occurrence of *start*.  *end* is
    searched from that position onwards, so an occurrence of *end* that lies
    before *start* no longer yields an empty result.

    Raises:
        ValueError: If *start* is not in *text*, or *end* does not occur
            after it.
    """
    begin = text.index(start) + len(start)
    return text[begin:text.index(end, begin)].strip()
@ -43,21 +52,23 @@ class PDFReceipt:
i += 2
return resultsArr
def _getInfosFromText(words: list[tuple], store: str = "edeka"):
    """Parse *words* with the parser of *store* and return the receipt data.

    Args:
        words: Word tuples of the receipt, passed unchanged to the parser.
        store: ``"edeka"`` or ``"kaufland"``; selects the parser.

    Returns:
        A tuple ``(receipt_number, date, items)`` where ``receipt_number`` is
        the parser's ``"bonid"`` converted to ``int``, and ``date``/``items``
        are the parser's ``"date"``/``"items"`` entries (``None`` if absent).

    Raises:
        ValueError: If *store* has no parser, or the receipt number cannot
            be converted to an integer.
    """
    if store == "edeka":
        result = edekaparser(words)
    elif store == "kaufland":
        result = kauflandparser(words)
    else:
        # Previously `result` stayed unbound here and the next line failed
        # with an UnboundLocalError; fail with a clear message instead.
        raise ValueError(f"Unsupported store: {store!r}")
    items = result.get("items")
    date = result.get("date")
    strReceiptNumber = result.get("bonid")
    try:
        intReceiptNumber = int(strReceiptNumber)
    except (TypeError, ValueError) as err:
        # TypeError covers a missing bonid (None), ValueError non-digit text.
        raise ValueError("Receipt Number not an integer.") from err
    return (intReceiptNumber, date, items)
def getPDFReceiptFromFile(strPDFFile: str):
    """Build a ``PDFReceipt`` from the file at *strPDFFile*.

    The file is opened and the open handle is passed to ``PDFReceipt``.  If
    the path does not exist, ``PDFReceipt(None)`` is returned instead.
    """
    try:
        with open(strPDFFile) as pdf_handle:
            receipt = PDFReceipt(pdf_handle)
    except FileNotFoundError:
        receipt = PDFReceipt(None)
    return receipt