major: integrate OCR into the PDF reader
This commit is contained in:
parent
0f6f76a66b
commit
d695e90c82
@ -4,6 +4,7 @@ from dotenv import load_dotenv
|
||||
basedir = os.path.abspath(os.path.dirname(__file__))
|
||||
load_dotenv(os.path.join(basedir, '.env'))
|
||||
|
||||
os.environ["TESSDATA_PREFIX"] = os.path.join(basedir, 'tessdata')
|
||||
class Config(object):
|
||||
SECRET_KEY = os.environ.get('SECRET_KEY') or "s0m37h!n6-obfu5c471ng"
|
||||
SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL', '').replace(
|
||||
|
||||
BIN
backend/configs/tessdata/ScrollView.jar
Normal file
BIN
backend/configs/tessdata/ScrollView.jar
Normal file
Binary file not shown.
1
backend/configs/tessdata/configs/alto
Normal file
1
backend/configs/tessdata/configs/alto
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_alto 1
|
||||
7
backend/configs/tessdata/configs/ambigs.train
Normal file
7
backend/configs/tessdata/configs/ambigs.train
Normal file
@ -0,0 +1,7 @@
|
||||
tessedit_ambigs_training 1
|
||||
load_freq_dawg 0
|
||||
load_punc_dawg 0
|
||||
load_system_dawg 0
|
||||
load_number_dawg 0
|
||||
ambigs_debug_level 3
|
||||
load_fixed_length_dawgs 0
|
||||
1
backend/configs/tessdata/configs/api_config
Normal file
1
backend/configs/tessdata/configs/api_config
Normal file
@ -0,0 +1 @@
|
||||
tessedit_zero_rejection T
|
||||
5
backend/configs/tessdata/configs/bigram
Normal file
5
backend/configs/tessdata/configs/bigram
Normal file
@ -0,0 +1,5 @@
|
||||
load_bigram_dawg True
|
||||
tessedit_enable_bigram_correction True
|
||||
tessedit_bigram_debug 3
|
||||
save_raw_choices True
|
||||
save_alt_choices True
|
||||
12
backend/configs/tessdata/configs/box.train
Normal file
12
backend/configs/tessdata/configs/box.train
Normal file
@ -0,0 +1,12 @@
|
||||
disable_character_fragments T
|
||||
file_type .bl
|
||||
textord_fast_pitch_test T
|
||||
tessedit_zero_rejection T
|
||||
tessedit_minimal_rejection F
|
||||
tessedit_write_rep_codes F
|
||||
edges_children_fix F
|
||||
edges_childarea 0.65
|
||||
edges_boxarea 0.9
|
||||
tessedit_resegment_from_boxes T
|
||||
tessedit_train_from_boxes T
|
||||
textord_no_rejects T
|
||||
13
backend/configs/tessdata/configs/box.train.stderr
Normal file
13
backend/configs/tessdata/configs/box.train.stderr
Normal file
@ -0,0 +1,13 @@
|
||||
file_type .bl
|
||||
#tessedit_use_nn F
|
||||
textord_fast_pitch_test T
|
||||
tessedit_zero_rejection T
|
||||
tessedit_minimal_rejection F
|
||||
tessedit_write_rep_codes F
|
||||
edges_children_fix F
|
||||
edges_childarea 0.65
|
||||
edges_boxarea 0.9
|
||||
tessedit_resegment_from_boxes T
|
||||
tessedit_train_from_boxes T
|
||||
#textord_repeat_extraction F
|
||||
textord_no_rejects T
|
||||
1
backend/configs/tessdata/configs/digits
Normal file
1
backend/configs/tessdata/configs/digits
Normal file
@ -0,0 +1 @@
|
||||
tessedit_char_whitelist 0123456789-.
|
||||
1
backend/configs/tessdata/configs/get.images
Normal file
1
backend/configs/tessdata/configs/get.images
Normal file
@ -0,0 +1 @@
|
||||
tessedit_write_images T
|
||||
2
backend/configs/tessdata/configs/hocr
Normal file
2
backend/configs/tessdata/configs/hocr
Normal file
@ -0,0 +1,2 @@
|
||||
tessedit_create_hocr 1
|
||||
hocr_font_info 0
|
||||
2
backend/configs/tessdata/configs/inter
Normal file
2
backend/configs/tessdata/configs/inter
Normal file
@ -0,0 +1,2 @@
|
||||
interactive_display_mode T
|
||||
tessedit_display_outwords T
|
||||
4
backend/configs/tessdata/configs/kannada
Normal file
4
backend/configs/tessdata/configs/kannada
Normal file
@ -0,0 +1,4 @@
|
||||
textord_skewsmooth_offset 8
|
||||
textord_skewsmooth_offset2 8
|
||||
textord_merge_desc 0.5
|
||||
textord_no_rejects 1
|
||||
2
backend/configs/tessdata/configs/linebox
Normal file
2
backend/configs/tessdata/configs/linebox
Normal file
@ -0,0 +1,2 @@
|
||||
tessedit_resegment_from_line_boxes 1
|
||||
tessedit_make_boxes_from_boxes 1
|
||||
1
backend/configs/tessdata/configs/logfile
Normal file
1
backend/configs/tessdata/configs/logfile
Normal file
@ -0,0 +1 @@
|
||||
debug_file tesseract.log
|
||||
11
backend/configs/tessdata/configs/lstm.train
Normal file
11
backend/configs/tessdata/configs/lstm.train
Normal file
@ -0,0 +1,11 @@
|
||||
file_type .bl
|
||||
textord_fast_pitch_test T
|
||||
tessedit_zero_rejection T
|
||||
tessedit_minimal_rejection F
|
||||
tessedit_write_rep_codes F
|
||||
edges_children_fix F
|
||||
edges_childarea 0.65
|
||||
edges_boxarea 0.9
|
||||
tessedit_train_line_recognizer T
|
||||
textord_no_rejects T
|
||||
tessedit_init_config_only T
|
||||
1
backend/configs/tessdata/configs/lstmbox
Normal file
1
backend/configs/tessdata/configs/lstmbox
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_lstmbox 1
|
||||
4
backend/configs/tessdata/configs/lstmdebug
Normal file
4
backend/configs/tessdata/configs/lstmdebug
Normal file
@ -0,0 +1,4 @@
|
||||
stopper_debug_level 1
|
||||
classify_debug_level 1
|
||||
segsearch_debug_level 1
|
||||
language_model_debug_level 3
|
||||
1
backend/configs/tessdata/configs/makebox
Normal file
1
backend/configs/tessdata/configs/makebox
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_boxfile 1
|
||||
1
backend/configs/tessdata/configs/pdf
Normal file
1
backend/configs/tessdata/configs/pdf
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_pdf 1
|
||||
1
backend/configs/tessdata/configs/quiet
Normal file
1
backend/configs/tessdata/configs/quiet
Normal file
@ -0,0 +1 @@
|
||||
debug_file /dev/null
|
||||
2
backend/configs/tessdata/configs/rebox
Normal file
2
backend/configs/tessdata/configs/rebox
Normal file
@ -0,0 +1,2 @@
|
||||
tessedit_resegment_from_boxes 1
|
||||
tessedit_make_boxes_from_boxes 1
|
||||
12
backend/configs/tessdata/configs/strokewidth
Normal file
12
backend/configs/tessdata/configs/strokewidth
Normal file
@ -0,0 +1,12 @@
|
||||
textord_show_blobs 0
|
||||
textord_debug_tabfind 3
|
||||
textord_tabfind_show_partitions 1
|
||||
textord_tabfind_show_initial_partitions 1
|
||||
textord_tabfind_show_columns 1
|
||||
textord_tabfind_show_blocks 1
|
||||
textord_tabfind_show_initialtabs 1
|
||||
textord_tabfind_show_finaltabs 1
|
||||
textord_tabfind_show_strokewidths 1
|
||||
textord_tabfind_show_vlines 0
|
||||
textord_tabfind_show_images 1
|
||||
tessedit_dump_pageseg_images 0
|
||||
1
backend/configs/tessdata/configs/tsv
Normal file
1
backend/configs/tessdata/configs/tsv
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_tsv 1
|
||||
3
backend/configs/tessdata/configs/txt
Normal file
3
backend/configs/tessdata/configs/txt
Normal file
@ -0,0 +1,3 @@
|
||||
# This config file should be used with other config files which create renderers.
|
||||
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
|
||||
tessedit_create_txt 1
|
||||
2
backend/configs/tessdata/configs/unlv
Normal file
2
backend/configs/tessdata/configs/unlv
Normal file
@ -0,0 +1,2 @@
|
||||
tessedit_write_unlv 1
|
||||
unlv_tilde_crunching T
|
||||
1
backend/configs/tessdata/configs/wordstrbox
Normal file
1
backend/configs/tessdata/configs/wordstrbox
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_wordstrbox 1
|
||||
BIN
backend/configs/tessdata/deu.traineddata.old
Normal file
BIN
backend/configs/tessdata/deu.traineddata.old
Normal file
Binary file not shown.
BIN
backend/configs/tessdata/eng.traineddata
Normal file
BIN
backend/configs/tessdata/eng.traineddata
Normal file
Binary file not shown.
2
backend/configs/tessdata/eng.user-patterns
Normal file
2
backend/configs/tessdata/eng.user-patterns
Normal file
@ -0,0 +1,2 @@
|
||||
1-\d\d\d-GOOG-411
|
||||
www.\n\\\*.com
|
||||
5
backend/configs/tessdata/eng.user-words
Normal file
5
backend/configs/tessdata/eng.user-words
Normal file
@ -0,0 +1,5 @@
|
||||
the
|
||||
quick
|
||||
brown
|
||||
fox
|
||||
jumped
|
||||
BIN
backend/configs/tessdata/jaxb-api-2.3.1.jar
Normal file
BIN
backend/configs/tessdata/jaxb-api-2.3.1.jar
Normal file
Binary file not shown.
BIN
backend/configs/tessdata/osd.traineddata
Normal file
BIN
backend/configs/tessdata/osd.traineddata
Normal file
Binary file not shown.
BIN
backend/configs/tessdata/pdf.ttf
Normal file
BIN
backend/configs/tessdata/pdf.ttf
Normal file
Binary file not shown.
BIN
backend/configs/tessdata/piccolo2d-core-3.0.1.jar
Normal file
BIN
backend/configs/tessdata/piccolo2d-core-3.0.1.jar
Normal file
Binary file not shown.
BIN
backend/configs/tessdata/piccolo2d-extras-3.0.1.jar
Normal file
BIN
backend/configs/tessdata/piccolo2d-extras-3.0.1.jar
Normal file
Binary file not shown.
1
backend/configs/tessdata/tessconfigs/batch
Normal file
1
backend/configs/tessdata/tessconfigs/batch
Normal file
@ -0,0 +1 @@
|
||||
# No content needed as all defaults are correct.
|
||||
2
backend/configs/tessdata/tessconfigs/batch.nochop
Normal file
2
backend/configs/tessdata/tessconfigs/batch.nochop
Normal file
@ -0,0 +1,2 @@
|
||||
chop_enable 0
|
||||
wordrec_enable_assoc 0
|
||||
7
backend/configs/tessdata/tessconfigs/matdemo
Normal file
7
backend/configs/tessdata/tessconfigs/matdemo
Normal file
@ -0,0 +1,7 @@
|
||||
#################################################
|
||||
# Adaptive Matcher Using PreAdapted Templates
|
||||
#################################################
|
||||
|
||||
classify_enable_adaptive_debugger 1
|
||||
matcher_debug_flags 6
|
||||
matcher_debug_level 1
|
||||
12
backend/configs/tessdata/tessconfigs/msdemo
Normal file
12
backend/configs/tessdata/tessconfigs/msdemo
Normal file
@ -0,0 +1,12 @@
|
||||
#################################################
|
||||
# Adaptive Matcher Using PreAdapted Templates
|
||||
#################################################
|
||||
|
||||
classify_enable_adaptive_debugger 1
|
||||
matcher_debug_flags 6
|
||||
matcher_debug_level 1
|
||||
|
||||
wordrec_display_splits 0
|
||||
wordrec_display_all_blobs 1
|
||||
wordrec_display_segmentations 2
|
||||
classify_debug_level 1
|
||||
1
backend/configs/tessdata/tessconfigs/nobatch
Normal file
1
backend/configs/tessdata/tessconfigs/nobatch
Normal file
@ -0,0 +1 @@
|
||||
|
||||
9
backend/configs/tessdata/tessconfigs/segdemo
Normal file
9
backend/configs/tessdata/tessconfigs/segdemo
Normal file
@ -0,0 +1,9 @@
|
||||
#################################################
|
||||
# Adaptive Matcher Using PreAdapted Templates
|
||||
#################################################
|
||||
|
||||
wordrec_display_splits 0
|
||||
wordrec_display_all_blobs 1
|
||||
wordrec_display_segmentations 2
|
||||
classify_debug_level 1
|
||||
stopper_debug_level 1
|
||||
@ -0,0 +1,38 @@
|
||||
"""raise bonid digits
|
||||
|
||||
Revision ID: 926395732c3e
|
||||
Revises: 2a64d3b9235a
|
||||
Create Date: 2024-08-24 10:33:39.109944
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = '926395732c3e'
|
||||
down_revision = '2a64d3b9235a'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade():
    """Widen ``receipt.bonid`` from 24 to 28 digits of precision (scale 0)."""
    previous_type = sa.NUMERIC(precision=24, scale=0)
    widened_type = sa.Numeric(precision=28, scale=0)

    # Batch mode is kept exactly as Alembic autogenerated it.
    with op.batch_alter_table('receipt', schema=None) as receipt_table:
        receipt_table.alter_column(
            'bonid',
            existing_type=previous_type,
            type_=widened_type,
            existing_nullable=True,
        )
|
||||
|
||||
|
||||
def downgrade():
    """Narrow ``receipt.bonid`` back from 28 to 24 digits of precision (scale 0)."""
    widened_type = sa.Numeric(precision=28, scale=0)
    previous_type = sa.NUMERIC(precision=24, scale=0)

    # Mirror image of upgrade(): same table, same column, types swapped.
    with op.batch_alter_table('receipt', schema=None) as receipt_table:
        receipt_table.alter_column(
            'bonid',
            existing_type=widened_type,
            type_=previous_type,
            existing_nullable=True,
        )
|
||||
@ -4,7 +4,7 @@ from src import db
|
||||
class Receipt(db.Model):
|
||||
id = db.Column(db.BigInteger, primary_key=True, autoincrement=True)
|
||||
date = db.Column(db.Date, nullable=False)
|
||||
bonid = db.Column(db.Numeric(precision=24, scale=0), unique=True)
|
||||
bonid = db.Column(db.Numeric(precision=28, scale=0), unique=True)
|
||||
from_user = db.Column(db.ForeignKey("login_token.token"),
|
||||
server_onupdate=db.FetchedValue())
|
||||
registered = db.Column(db.Boolean, nullable=False,
|
||||
|
||||
@ -40,7 +40,7 @@ def upload_receipt(establishment: int):
|
||||
db.session.commit()
|
||||
if pdfReceipt:
|
||||
rename(f"{PDFDir}/temp.pdf", f"{PDFDir}{secure_filename(f'{dbReceipt.id}.pdf')}")
|
||||
LOGGER.debug(receipt.text)
|
||||
LOGGER.debug(receipt.words)
|
||||
return redirect(url_for("receipts.check_items.confirm_receipt_items", receipt_id = dbReceipt.id))
|
||||
else:
|
||||
LOGGER.debug(form.errors)
|
||||
|
||||
@ -0,0 +1,20 @@
|
||||
from datetime import datetime
|
||||
|
||||
def getDictFromWords(words: list[tuple]):
    """Build a receipt dict from the word tuples of an Edeka receipt.

    Every entry of ``words`` is read positionally: ``[4]`` is the word text,
    ``[5]`` is the grouping number that starts a new item whenever it
    changes, and ``[6]`` / ``[7]`` are compared against small integers
    (presumably PyMuPDF's block, line and word numbers -- confirm against
    the caller that produces ``words``).

    Args:
        words: Word tuples in reading order. The first 14 are skipped as
            header; the date is read from the sixth-to-last entry and the
            receipt number from the last one.

    Returns:
        A dict with ``"items"`` (list of dicts holding ``"itemname"`` and,
        when one was found, ``"price"``), ``"date"`` (a ``datetime.date``)
        and ``"bonid"`` (text of the last word, not converted).

    Raises:
        IndexError: If ``words`` has fewer than six entries.
        ValueError: If the date word does not match ``%d.%m.%y``.
    """
    results = {"items": []}
    results['date'] = datetime.strptime(words[-6][4], "%d.%m.%y").date()
    results['bonid'] = words[-1][4]
    items = results['items']

    # None (not 0) as the "nothing seen yet" marker: group number 0 is a
    # legitimate value, and starting at 0 made the first word of such a
    # group index into the still-empty item list.
    current_group = None
    skipwords = 14
    for word in words[skipwords:]:
        text = word[4]
        if current_group != word[5]:
            # A new group opens a new item, named after its first word.
            items.append({"itemname": text})
            current_group = word[5]
        elif word[6] == 0:
            # Further words on the item's first line extend the name.
            items[-1]["itemname"] += " " + text
        if word[6] == 1 and word[7] == 1:
            # Keep only the part before "*" as the price.
            items[-1]["price"] = text.split("*")[0]
        if "----" in text:
            # The dashed separator ends the item list; the entry it landed
            # in is discarded.
            del items[-1]
            break
    return results
|
||||
@ -0,0 +1,27 @@
|
||||
from datetime import datetime
|
||||
|
||||
def getDictFromWords(words: list[tuple]):
    """Build a receipt dict from the word tuples of a Kaufland receipt.

    Every entry of ``words`` is read positionally: ``[4]`` is the word text,
    ``[5]`` is the grouping number that starts a new item whenever it
    changes, and ``[6]`` / ``[7]`` are compared against small integers
    (presumably PyMuPDF's block, line and word numbers -- confirm against
    the caller that produces ``words``).

    Args:
        words: Word tuples in reading order. The first 9 are skipped as
            header and the receipt number is taken from the last entry.

    Returns:
        A dict with ``"items"`` (list of dicts holding ``"itemname"`` and,
        when found, ``"price"`` and ``"amount"``) and ``"bonid"`` (text of
        the last word, not converted). ``"date"`` (a ``datetime.date``) is
        present only when a "Datum:" label followed by another word exists.

    Raises:
        IndexError: If ``words`` is empty, or an "x" quantity marker is the
            very last word.
        ValueError: If the word after "Datum:" does not match ``%d.%m.%Y``.
    """
    results = {"items": []}
    results['bonid'] = words[-1][4]
    items = results['items']

    # None (not 0) as the "nothing seen yet" marker: group number 0 is a
    # legitimate value, and starting at 0 made the first word of such a
    # group index into the still-empty item list.
    current_group = None
    skipwords = 9
    for index, word in enumerate(words[skipwords:], start=skipwords):
        text = word[4]
        if current_group != word[5]:
            # A new group opens a new item, named after its first word.
            items.append({"itemname": text})
            current_group = word[5]
        elif word[6] == 0:
            # Further words on the item's first line extend the name.
            items[-1]["itemname"] += " " + text
        if word[6] == 1 and word[7] == 0:
            if text.lower() == "x":
                # An "x" marker: the word right after it is the amount.
                items[-1]["amount"] = words[index + 1][4]
            else:
                items[-1]["price"] = text
        elif word[6] == 2:
            items[-1]["price"] = text
        if text.lower() == "gesamt":
            # The total line ends the item list; the entry it landed in
            # is discarded.
            del items[-1]
            break

    # The date is the word following the "Datum:" label. Pairing each word
    # with its successor avoids re-reversing the list on every step and
    # cannot wrap around to the first word when the label comes last. The
    # earliest label wins, as it did with the previous reversed scan.
    for label, value in zip(words, words[1:]):
        if label[4].lower() == "datum:":
            results['date'] = datetime.strptime(value[4], "%d.%m.%Y").date()
            break
    return results
|
||||
@ -1,5 +1,7 @@
|
||||
import fitz
|
||||
from datetime import datetime, date
|
||||
from .edeka.edeka_parser import getDictFromWords as edekaparser
|
||||
from .kaufland.kaufland_parser import getDictFromWords as kauflandparser
|
||||
from re import search
|
||||
|
||||
class PDFReceipt:
|
||||
@ -10,22 +12,29 @@ class PDFReceipt:
|
||||
parser -- A keyword in lowercase to tell how the receipt is formated.
|
||||
Currently supported: 'edeka'
|
||||
"""
|
||||
def __init__(self, bPDFFile, parser: str = "edeka") -> None:
|
||||
def __init__(self, strPDFFile) -> None:
|
||||
try:
|
||||
self.text = PDFReceipt._getTextFromPDF(bPDFFile)
|
||||
self.id, self.date, self.items = PDFReceipt._getInfosFromText(self.text, parser)
|
||||
self.words = PDFReceipt._getWordsFromPDF(strPDFFile)
|
||||
storename = PDFReceipt._getStoreName(self.words)
|
||||
self.id, self.date, self.items = PDFReceipt._getInfosFromText(self.words, store = storename)
|
||||
except:
|
||||
self.text = "PDF konnte nicht geladen werden."
|
||||
self.words = "PDF konnte nicht geladen werden."
|
||||
self.date = date.today()
|
||||
self.id = None
|
||||
self.items = []
|
||||
|
||||
def _getTextFromPDF(file):
|
||||
def _getWordsFromPDF(file):
|
||||
with fitz.open(file, filetype="pdf") as doc:
|
||||
text = ""
|
||||
words = []
|
||||
for page in doc:
|
||||
text += page.get_text()
|
||||
return text.strip()
|
||||
words.extend(page.get_text("words", textpage=page.get_textpage_ocr(), sort=True))
|
||||
return words
|
||||
|
||||
def _getStoreName(words: list[tuple]) -> str:
|
||||
for word in words:
|
||||
if word[4].lower() in ("edeka", "kaufland"):
|
||||
return word[4].lower()
|
||||
return "unknown"
|
||||
|
||||
def _getItemsTextFromText(text, start="", end=""):
|
||||
return text[text.index(start)+len(start):text.index(end)].strip()
|
||||
@ -43,21 +52,23 @@ class PDFReceipt:
|
||||
i += 2
|
||||
return resultsArr
|
||||
|
||||
def _getInfosFromText(text: str, parser: str = "edeka"):
|
||||
if parser.lower() == "edeka":
|
||||
items = PDFReceipt._convertItemsTextToDict(PDFReceipt._getItemsTextFromText(text, start="EUR", end="----------"))
|
||||
strDate = text.split("\n")[-1].split(" ")[0]
|
||||
date = datetime.strptime(strDate, "%d.%m.%y").date()
|
||||
strReceiptNumber = text.split("\n")[-1].split(" ")[-1]
|
||||
def _getInfosFromText(words: str, store: str = "edeka"):
|
||||
if store == "edeka":
|
||||
result = edekaparser(words)
|
||||
elif store == "kaufland":
|
||||
result = kauflandparser(words)
|
||||
items = result.get("items")
|
||||
date = result.get("date")
|
||||
strReceiptNumber = result.get("bonid")
|
||||
try:
|
||||
intReceiptNumber = int(strReceiptNumber)
|
||||
except:
|
||||
raise ValueError("Receipt Number not an integer.")
|
||||
return (intReceiptNumber, date, items)
|
||||
|
||||
def getPDFReceiptFromFile(strPDFFile: str, parser: str = "edeka"):
|
||||
def getPDFReceiptFromFile(strPDFFile: str):
|
||||
try:
|
||||
with open(strPDFFile) as doc:
|
||||
return PDFReceipt(doc, parser)
|
||||
return PDFReceipt(doc)
|
||||
except FileNotFoundError as e:
|
||||
return PDFReceipt(None)
|
||||
Loading…
x
Reference in New Issue
Block a user