diff --git a/backend/configs/config.py b/backend/configs/config.py index 32543c8..bbc6e6e 100644 --- a/backend/configs/config.py +++ b/backend/configs/config.py @@ -4,6 +4,7 @@ from dotenv import load_dotenv basedir = os.path.abspath(os.path.dirname(__file__)) load_dotenv(os.path.join(basedir, '.env')) +os.environ["TESSDATA_PREFIX"] = os.path.join(basedir, 'tessdata') class Config(object): SECRET_KEY = os.environ.get('SECRET_KEY') or "s0m37h!n6-obfu5c471ng" SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL', '').replace( diff --git a/backend/configs/tessdata/ScrollView.jar b/backend/configs/tessdata/ScrollView.jar new file mode 100644 index 0000000..824eda1 Binary files /dev/null and b/backend/configs/tessdata/ScrollView.jar differ diff --git a/backend/configs/tessdata/configs/alto b/backend/configs/tessdata/configs/alto new file mode 100644 index 0000000..0dd12a7 --- /dev/null +++ b/backend/configs/tessdata/configs/alto @@ -0,0 +1 @@ +tessedit_create_alto 1 diff --git a/backend/configs/tessdata/configs/ambigs.train b/backend/configs/tessdata/configs/ambigs.train new file mode 100644 index 0000000..23035a1 --- /dev/null +++ b/backend/configs/tessdata/configs/ambigs.train @@ -0,0 +1,7 @@ +tessedit_ambigs_training 1 +load_freq_dawg 0 +load_punc_dawg 0 +load_system_dawg 0 +load_number_dawg 0 +ambigs_debug_level 3 +load_fixed_length_dawgs 0 diff --git a/backend/configs/tessdata/configs/api_config b/backend/configs/tessdata/configs/api_config new file mode 100644 index 0000000..5cd6ec0 --- /dev/null +++ b/backend/configs/tessdata/configs/api_config @@ -0,0 +1 @@ +tessedit_zero_rejection T diff --git a/backend/configs/tessdata/configs/bigram b/backend/configs/tessdata/configs/bigram new file mode 100644 index 0000000..5d6c2d0 --- /dev/null +++ b/backend/configs/tessdata/configs/bigram @@ -0,0 +1,5 @@ +load_bigram_dawg True +tessedit_enable_bigram_correction True +tessedit_bigram_debug 3 +save_raw_choices True +save_alt_choices True diff --git a/backend/configs/tessdata/configs/box.train b/backend/configs/tessdata/configs/box.train new file mode 100644 index 0000000..d39f268 --- /dev/null +++ b/backend/configs/tessdata/configs/box.train @@ -0,0 +1,12 @@ +disable_character_fragments T +file_type .bl +textord_fast_pitch_test T +tessedit_zero_rejection T +tessedit_minimal_rejection F +tessedit_write_rep_codes F +edges_children_fix F +edges_childarea 0.65 +edges_boxarea 0.9 +tessedit_resegment_from_boxes T +tessedit_train_from_boxes T +textord_no_rejects T diff --git a/backend/configs/tessdata/configs/box.train.stderr b/backend/configs/tessdata/configs/box.train.stderr new file mode 100644 index 0000000..82754e9 --- /dev/null +++ b/backend/configs/tessdata/configs/box.train.stderr @@ -0,0 +1,13 @@ +file_type .bl +#tessedit_use_nn F +textord_fast_pitch_test T +tessedit_zero_rejection T +tessedit_minimal_rejection F +tessedit_write_rep_codes F +edges_children_fix F +edges_childarea 0.65 +edges_boxarea 0.9 +tessedit_resegment_from_boxes T +tessedit_train_from_boxes T +#textord_repeat_extraction F +textord_no_rejects T diff --git a/backend/configs/tessdata/configs/digits b/backend/configs/tessdata/configs/digits new file mode 100644 index 0000000..6a329f8 --- /dev/null +++ b/backend/configs/tessdata/configs/digits @@ -0,0 +1 @@ +tessedit_char_whitelist 0123456789-. diff --git a/backend/configs/tessdata/configs/get.images b/backend/configs/tessdata/configs/get.images new file mode 100644 index 0000000..7d00b61 --- /dev/null +++ b/backend/configs/tessdata/configs/get.images @@ -0,0 +1 @@ +tessedit_write_images T diff --git a/backend/configs/tessdata/configs/hocr b/backend/configs/tessdata/configs/hocr new file mode 100644 index 0000000..5ab372e --- /dev/null +++ b/backend/configs/tessdata/configs/hocr @@ -0,0 +1,2 @@ +tessedit_create_hocr 1 +hocr_font_info 0 diff --git a/backend/configs/tessdata/configs/inter b/backend/configs/tessdata/configs/inter new file mode 100644 index 0000000..252f1a1 --- /dev/null +++ b/backend/configs/tessdata/configs/inter @@ -0,0 +1,2 @@ +interactive_display_mode T +tessedit_display_outwords T diff --git a/backend/configs/tessdata/configs/kannada b/backend/configs/tessdata/configs/kannada new file mode 100644 index 0000000..c6ac105 --- /dev/null +++ b/backend/configs/tessdata/configs/kannada @@ -0,0 +1,4 @@ +textord_skewsmooth_offset 8 +textord_skewsmooth_offset2 8 +textord_merge_desc 0.5 +textord_no_rejects 1 diff --git a/backend/configs/tessdata/configs/linebox b/backend/configs/tessdata/configs/linebox new file mode 100644 index 0000000..bd9c114 --- /dev/null +++ b/backend/configs/tessdata/configs/linebox @@ -0,0 +1,2 @@ +tessedit_resegment_from_line_boxes 1 +tessedit_make_boxes_from_boxes 1 diff --git a/backend/configs/tessdata/configs/logfile b/backend/configs/tessdata/configs/logfile new file mode 100644 index 0000000..a160f9b --- /dev/null +++ b/backend/configs/tessdata/configs/logfile @@ -0,0 +1 @@ +debug_file tesseract.log diff --git a/backend/configs/tessdata/configs/lstm.train b/backend/configs/tessdata/configs/lstm.train new file mode 100644 index 0000000..5ff3772 --- /dev/null +++ b/backend/configs/tessdata/configs/lstm.train @@ -0,0 +1,11 @@ +file_type .bl +textord_fast_pitch_test T +tessedit_zero_rejection T +tessedit_minimal_rejection F +tessedit_write_rep_codes F +edges_children_fix F +edges_childarea 0.65 +edges_boxarea 0.9 +tessedit_train_line_recognizer T +textord_no_rejects T +tessedit_init_config_only T diff --git a/backend/configs/tessdata/configs/lstmbox b/backend/configs/tessdata/configs/lstmbox new file mode 100644 index 0000000..a6f2ced --- /dev/null +++ b/backend/configs/tessdata/configs/lstmbox @@ -0,0 +1 @@ +tessedit_create_lstmbox 1 diff --git a/backend/configs/tessdata/configs/lstmdebug b/backend/configs/tessdata/configs/lstmdebug new file mode 100644 index 0000000..3fa3dee --- /dev/null +++ b/backend/configs/tessdata/configs/lstmdebug @@ -0,0 +1,4 @@ +stopper_debug_level 1 +classify_debug_level 1 +segsearch_debug_level 1 +language_model_debug_level 3 diff --git a/backend/configs/tessdata/configs/makebox b/backend/configs/tessdata/configs/makebox new file mode 100644 index 0000000..3d90ac2 --- /dev/null +++ b/backend/configs/tessdata/configs/makebox @@ -0,0 +1 @@ +tessedit_create_boxfile 1 diff --git a/backend/configs/tessdata/configs/pdf b/backend/configs/tessdata/configs/pdf new file mode 100644 index 0000000..59645d7 --- /dev/null +++ b/backend/configs/tessdata/configs/pdf @@ -0,0 +1 @@ +tessedit_create_pdf 1 diff --git a/backend/configs/tessdata/configs/quiet b/backend/configs/tessdata/configs/quiet new file mode 100644 index 0000000..35b59a9 --- /dev/null +++ b/backend/configs/tessdata/configs/quiet @@ -0,0 +1 @@ +debug_file /dev/null diff --git a/backend/configs/tessdata/configs/rebox b/backend/configs/tessdata/configs/rebox new file mode 100644 index 0000000..f8342b4 --- /dev/null +++ b/backend/configs/tessdata/configs/rebox @@ -0,0 +1,2 @@ +tessedit_resegment_from_boxes 1 +tessedit_make_boxes_from_boxes 1 diff --git a/backend/configs/tessdata/configs/strokewidth b/backend/configs/tessdata/configs/strokewidth new file mode 100644 index 0000000..e95b592 --- /dev/null +++ b/backend/configs/tessdata/configs/strokewidth @@ -0,0 +1,12 @@ +textord_show_blobs 0 +textord_debug_tabfind 3 +textord_tabfind_show_partitions 1 +textord_tabfind_show_initial_partitions 1 +textord_tabfind_show_columns 1 +textord_tabfind_show_blocks 1 +textord_tabfind_show_initialtabs 1 +textord_tabfind_show_finaltabs 1 +textord_tabfind_show_strokewidths 1 +textord_tabfind_show_vlines 0 +textord_tabfind_show_images 1 +tessedit_dump_pageseg_images 0 diff --git a/backend/configs/tessdata/configs/tsv b/backend/configs/tessdata/configs/tsv new file mode 100644 index 0000000..dc52478 --- /dev/null +++ b/backend/configs/tessdata/configs/tsv @@ -0,0 +1 @@ +tessedit_create_tsv 1 diff --git a/backend/configs/tessdata/configs/txt b/backend/configs/tessdata/configs/txt new file mode 100644 index 0000000..a0cc952 --- /dev/null +++ b/backend/configs/tessdata/configs/txt @@ -0,0 +1,3 @@ +# This config file should be used with other config files which create renderers. +# usage example: tesseract eurotext.tif eurotext txt hocr pdf +tessedit_create_txt 1 diff --git a/backend/configs/tessdata/configs/unlv b/backend/configs/tessdata/configs/unlv new file mode 100644 index 0000000..d2e22f5 --- /dev/null +++ b/backend/configs/tessdata/configs/unlv @@ -0,0 +1,2 @@ +tessedit_write_unlv 1 +unlv_tilde_crunching T diff --git a/backend/configs/tessdata/configs/wordstrbox b/backend/configs/tessdata/configs/wordstrbox new file mode 100644 index 0000000..38cd41c --- /dev/null +++ b/backend/configs/tessdata/configs/wordstrbox @@ -0,0 +1 @@ +tessedit_create_wordstrbox 1 diff --git a/backend/configs/tessdata/deu.traineddata.old b/backend/configs/tessdata/deu.traineddata.old new file mode 100644 index 0000000..97ed7b2 Binary files /dev/null and b/backend/configs/tessdata/deu.traineddata.old differ diff --git a/backend/configs/tessdata/eng.traineddata b/backend/configs/tessdata/eng.traineddata new file mode 100644 index 0000000..af9cc91 Binary files /dev/null and b/backend/configs/tessdata/eng.traineddata differ diff --git a/backend/configs/tessdata/eng.user-patterns b/backend/configs/tessdata/eng.user-patterns new file mode 100644 index 0000000..5daba44 --- /dev/null +++ b/backend/configs/tessdata/eng.user-patterns @@ -0,0 +1,2 @@ +1-\d\d\d-GOOG-411 +www.\n\\\*.com diff --git a/backend/configs/tessdata/eng.user-words b/backend/configs/tessdata/eng.user-words new file mode 100644 index 0000000..e0c5a63 --- /dev/null +++ b/backend/configs/tessdata/eng.user-words @@ -0,0 +1,5 @@ +the +quick +brown +fox +jumped diff --git a/backend/configs/tessdata/jaxb-api-2.3.1.jar b/backend/configs/tessdata/jaxb-api-2.3.1.jar new file mode 100644 index 0000000..4565865 Binary files /dev/null and b/backend/configs/tessdata/jaxb-api-2.3.1.jar differ diff --git a/backend/configs/tessdata/osd.traineddata b/backend/configs/tessdata/osd.traineddata new file mode 100644 index 0000000..527457c Binary files /dev/null and b/backend/configs/tessdata/osd.traineddata differ diff --git a/backend/configs/tessdata/pdf.ttf b/backend/configs/tessdata/pdf.ttf new file mode 100644 index 0000000..d1472b2 Binary files /dev/null and b/backend/configs/tessdata/pdf.ttf differ diff --git a/backend/configs/tessdata/piccolo2d-core-3.0.1.jar b/backend/configs/tessdata/piccolo2d-core-3.0.1.jar new file mode 100644 index 0000000..df84ed5 Binary files /dev/null and b/backend/configs/tessdata/piccolo2d-core-3.0.1.jar differ diff --git a/backend/configs/tessdata/piccolo2d-extras-3.0.1.jar b/backend/configs/tessdata/piccolo2d-extras-3.0.1.jar new file mode 100644 index 0000000..daf51c0 Binary files /dev/null and b/backend/configs/tessdata/piccolo2d-extras-3.0.1.jar differ diff --git a/backend/configs/tessdata/tessconfigs/batch b/backend/configs/tessdata/tessconfigs/batch new file mode 100644 index 0000000..a681e4a --- /dev/null +++ b/backend/configs/tessdata/tessconfigs/batch @@ -0,0 +1 @@ +# No content needed as all defaults are correct. diff --git a/backend/configs/tessdata/tessconfigs/batch.nochop b/backend/configs/tessdata/tessconfigs/batch.nochop new file mode 100644 index 0000000..ebaab94 --- /dev/null +++ b/backend/configs/tessdata/tessconfigs/batch.nochop @@ -0,0 +1,2 @@ +chop_enable 0 +wordrec_enable_assoc 0 diff --git a/backend/configs/tessdata/tessconfigs/matdemo b/backend/configs/tessdata/tessconfigs/matdemo new file mode 100644 index 0000000..c34567b --- /dev/null +++ b/backend/configs/tessdata/tessconfigs/matdemo @@ -0,0 +1,7 @@ +################################################# +# Adaptive Matcher Using PreAdapted Templates +################################################# + +classify_enable_adaptive_debugger 1 +matcher_debug_flags 6 +matcher_debug_level 1 diff --git a/backend/configs/tessdata/tessconfigs/msdemo b/backend/configs/tessdata/tessconfigs/msdemo new file mode 100644 index 0000000..9c1184a --- /dev/null +++ b/backend/configs/tessdata/tessconfigs/msdemo @@ -0,0 +1,12 @@ +################################################# +# Adaptive Matcher Using PreAdapted Templates +################################################# + +classify_enable_adaptive_debugger 1 +matcher_debug_flags 6 +matcher_debug_level 1 + +wordrec_display_splits 0 +wordrec_display_all_blobs 1 +wordrec_display_segmentations 2 +classify_debug_level 1 diff --git a/backend/configs/tessdata/tessconfigs/nobatch b/backend/configs/tessdata/tessconfigs/nobatch new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/backend/configs/tessdata/tessconfigs/nobatch @@ -0,0 +1 @@ + diff --git a/backend/configs/tessdata/tessconfigs/segdemo b/backend/configs/tessdata/tessconfigs/segdemo new file mode 100644 index 0000000..eaff69f --- /dev/null +++ b/backend/configs/tessdata/tessconfigs/segdemo @@ -0,0 +1,9 @@ +################################################# +# Adaptive Matcher Using PreAdapted Templates +################################################# + +wordrec_display_splits 0 +wordrec_display_all_blobs 1 +wordrec_display_segmentations 2 +classify_debug_level 1 +stopper_debug_level 1 diff --git a/backend/migrations/versions/926395732c3e_raise_bonid_digits.py b/backend/migrations/versions/926395732c3e_raise_bonid_digits.py new file mode 100644 index 0000000..d4a69b3 --- /dev/null +++ b/backend/migrations/versions/926395732c3e_raise_bonid_digits.py @@ -0,0 +1,38 @@ +"""raise bonid digits + +Revision ID: 926395732c3e +Revises: 2a64d3b9235a +Create Date: 2024-08-24 10:33:39.109944 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '926395732c3e' +down_revision = '2a64d3b9235a' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table('receipt', schema=None) as batch_op: + batch_op.alter_column('bonid', + existing_type=sa.NUMERIC(precision=24, scale=0), + type_=sa.Numeric(precision=28, scale=0), + existing_nullable=True) + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table('receipt', schema=None) as batch_op: + batch_op.alter_column('bonid', + existing_type=sa.Numeric(precision=28, scale=0), + type_=sa.NUMERIC(precision=24, scale=0), + existing_nullable=True) + + # ### end Alembic commands ### diff --git a/backend/models/receipt.py b/backend/models/receipt.py index 6fdf206..a5f54bf 100644 --- a/backend/models/receipt.py +++ b/backend/models/receipt.py @@ -4,7 +4,7 @@ from src import db class Receipt(db.Model): id = db.Column(db.BigInteger, primary_key=True, autoincrement=True) date = db.Column(db.Date, nullable=False) - bonid = db.Column(db.Numeric(precision=24, scale=0), unique=True) + bonid = db.Column(db.Numeric(precision=28, scale=0), unique=True) from_user = db.Column(db.ForeignKey("login_token.token"), server_onupdate=db.FetchedValue()) registered = db.Column(db.Boolean, nullable=False, diff --git a/backend/src/receipts/upload/routes.py b/backend/src/receipts/upload/routes.py index 1bc723a..78560ca 100644 --- a/backend/src/receipts/upload/routes.py +++ b/backend/src/receipts/upload/routes.py @@ -40,7 +40,7 @@ def upload_receipt(establishment: int): db.session.commit() if pdfReceipt: rename(f"{PDFDir}/temp.pdf", f"{PDFDir}{secure_filename(f'{dbReceipt.id}.pdf')}") - LOGGER.debug(receipt.text) + LOGGER.debug(receipt.words) return redirect(url_for("receipts.check_items.confirm_receipt_items", receipt_id = dbReceipt.id)) else: LOGGER.debug(form.errors) diff --git a/backend/src/utils/modules/receipt_parser/edeka/edeka_parser.py b/backend/src/utils/modules/receipt_parser/edeka/edeka_parser.py index e69de29..6b332a1 100644 --- a/backend/src/utils/modules/receipt_parser/edeka/edeka_parser.py +++ b/backend/src/utils/modules/receipt_parser/edeka/edeka_parser.py @@ -0,0 +1,20 @@ +from datetime import datetime + +def getDictFromWords(words: list[tuple]): + results = {"items": []} + results['date'] = datetime.strptime(words[-6][4], "%d.%m.%y").date() + results['bonid'] = words[-1][4] + currentline = 0 + skipwords = 14 + for i, word in enumerate(words[skipwords:]): + if currentline != word[5]: + results['items'].append({"itemname": word[4]}) + currentline = word[5] + elif word[6] == 0: + results['items'][-1]["itemname"] += " " + word[4] + if word[6] == 1 and word[7] == 1: + results['items'][-1]["price"] = word[4].split("*")[0] + if "----" in word[4]: + del(results['items'][-1]) + break + return results \ No newline at end of file diff --git a/backend/src/utils/modules/receipt_parser/kaufland/kaufland_parser.py b/backend/src/utils/modules/receipt_parser/kaufland/kaufland_parser.py new file mode 100644 index 0000000..11e2d78 --- /dev/null +++ b/backend/src/utils/modules/receipt_parser/kaufland/kaufland_parser.py @@ -0,0 +1,27 @@ +from datetime import datetime + +def getDictFromWords(words: list[tuple]): + results = {"items": []} + results['bonid'] = words[-1][4] + currentline = 0 + skipwords = 9 + for i, word in enumerate(words[skipwords:]): + if currentline != word[5]: + results['items'].append({"itemname": word[4]}) + currentline = word[5] + elif word[6] == 0: + results['items'][-1]["itemname"] += " " + word[4] + if word[6] == 1 and word[7] == 0: + if word[4].lower() == "x": + results['items'][-1]["amount"] = words[i+skipwords+1][4] + else: + results['items'][-1]["price"] = word[4] + elif word[6] == 2: + results['items'][-1]["price"] = word[4] + if word[4].lower() == "gesamt": + del(results['items'][-1]) + break + for i, word in enumerate(words[::-1]): + if word[4].lower() == "datum:": + results['date'] = datetime.strptime(words[::-1][i-1][4], "%d.%m.%Y").date() + return results \ No newline at end of file diff --git a/backend/src/utils/modules/receipt_parser/pdf_receipt_parser.py b/backend/src/utils/modules/receipt_parser/pdf_receipt_parser.py index af39309..1d2e0cd 100644 --- a/backend/src/utils/modules/receipt_parser/pdf_receipt_parser.py +++ b/backend/src/utils/modules/receipt_parser/pdf_receipt_parser.py @@ -1,5 +1,7 @@ import fitz from datetime import datetime, date +from .edeka.edeka_parser import getDictFromWords as edekaparser +from .kaufland.kaufland_parser import getDictFromWords as kauflandparser from re import search class PDFReceipt: @@ -10,22 +12,29 @@ class PDFReceipt: parser -- A keyword in lowercase to tell how the receipt is formated. Currently supported: 'edeka' """ - def __init__(self, bPDFFile, parser: str = "edeka") -> None: + def __init__(self, strPDFFile) -> None: try: - self.text = PDFReceipt._getTextFromPDF(bPDFFile) - self.id, self.date, self.items = PDFReceipt._getInfosFromText(self.text, parser) + self.words = PDFReceipt._getWordsFromPDF(strPDFFile) + storename = PDFReceipt._getStoreName(self.words) + self.id, self.date, self.items = PDFReceipt._getInfosFromText(self.words, store = storename) except: - self.text = "PDF konnte nicht geladen werden." + self.words = "PDF konnte nicht geladen werden." self.date = date.today() self.id = None self.items = [] - def _getTextFromPDF(file): + def _getWordsFromPDF(file): with fitz.open(file, filetype="pdf") as doc: - text = "" + words = [] for page in doc: - text += page.get_text() - return text.strip() + words.extend(page.get_text("words", textpage=page.get_textpage_ocr(), sort=True)) + return words + + def _getStoreName(words: list[tuple]) -> str: + for word in words: + if word[4].lower() in ("edeka", "kaufland"): + return word[4].lower() + return "unknown" def _getItemsTextFromText(text, start="", end=""): return text[text.index(start)+len(start):text.index(end)].strip() @@ -43,21 +52,23 @@ class PDFReceipt: i += 2 return resultsArr - def _getInfosFromText(text: str, parser: str = "edeka"): - if parser.lower() == "edeka": - items = PDFReceipt._convertItemsTextToDict(PDFReceipt._getItemsTextFromText(text, start="EUR", end="----------")) - strDate = text.split("\n")[-1].split(" ")[0] - date = datetime.strptime(strDate, "%d.%m.%y").date() - strReceiptNumber = text.split("\n")[-1].split(" ")[-1] - try: - intReceiptNumber = int(strReceiptNumber) - except: - raise ValueError("Receipt Number not an integer.") + def _getInfosFromText(words: str, store: str = "edeka"): + if store == "edeka": + result = edekaparser(words) + elif store == "kaufland": + result = kauflandparser(words) + items = result.get("items") + date = result.get("date") + strReceiptNumber = result.get("bonid") + try: + intReceiptNumber = int(strReceiptNumber) + except: + raise ValueError("Receipt Number not an integer.") return (intReceiptNumber, date, items) - def getPDFReceiptFromFile(strPDFFile: str, parser: str = "edeka"): + def getPDFReceiptFromFile(strPDFFile: str): try: with open(strPDFFile) as doc: - return PDFReceipt(doc, parser) + return PDFReceipt(doc) except FileNotFoundError as e: return PDFReceipt(None) \ No newline at end of file