58 lines
2.1 KiB
Python
58 lines
2.1 KiB
Python
import fitz
|
|
from datetime import date
|
|
from .edeka.edeka_parser import getDictFromWords as edekaparser
|
|
from .kaufland.kaufland_parser import getDictFromWords as kauflandparser
|
|
from re import search
|
|
|
|
class PDFReceipt:
|
|
"""Class to use a PDF-Receipt as an object.
|
|
|
|
Arguments:
|
|
strPDFFile -- The path to the PDF-File as a string.
|
|
parser -- A keyword in lowercase to tell how the receipt is formated.
|
|
Currently supported: 'edeka'
|
|
"""
|
|
def __init__(self, strPDFFile) -> None:
|
|
try:
|
|
self.words = PDFReceipt._getWordsFromPDF(strPDFFile)
|
|
storename = PDFReceipt._getStoreName(self.words)
|
|
self.id, self.date, self.items = PDFReceipt._getInfosFromText(self.words, store = storename)
|
|
except:
|
|
self.words = "PDF konnte nicht geladen werden."
|
|
self.date = date.today()
|
|
self.id = None
|
|
self.items = []
|
|
|
|
def _getWordsFromPDF(file):
|
|
with fitz.open(file, filetype="pdf") as doc:
|
|
words = []
|
|
for page in doc:
|
|
words.extend(page.get_text("words", textpage=page.get_textpage_ocr(language = 'deu'), sort=True))
|
|
return words
|
|
|
|
def _getStoreName(words: list[tuple]) -> str:
|
|
for word in words:
|
|
if word[4].lower() in ("edeka", "kaufland"):
|
|
return word[4].lower()
|
|
return "unknown"
|
|
|
|
def _getInfosFromText(words: str, store: str = "edeka"):
|
|
if store == "edeka":
|
|
result = edekaparser(words)
|
|
elif store == "kaufland":
|
|
result = kauflandparser(words)
|
|
items = result.get("items")
|
|
date = result.get("date")
|
|
strReceiptNumber = result.get("bonid")
|
|
try:
|
|
intReceiptNumber = int(strReceiptNumber)
|
|
except:
|
|
raise ValueError("Receipt Number not an integer.")
|
|
return (intReceiptNumber, date, items)
|
|
|
|
def getPDFReceiptFromFile(strPDFFile: str):
|
|
try:
|
|
with open(strPDFFile) as doc:
|
|
return PDFReceipt(doc)
|
|
except FileNotFoundError as e:
|
|
return PDFReceipt(None) |