from datetime import date
from re import search

import fitz

from .edeka.edeka_parser import getDictFromWords as edekaparser
from .kaufland.kaufland_parser import getDictFromWords as kauflandparser


class PDFReceipt:
    """Class to use a PDF receipt as an object.

    Arguments:
    strPDFFile -- The path to the PDF file as a string.

    The store is detected from the words extracted from the PDF and selects
    the parser. Currently supported: 'edeka', 'kaufland'.

    Attributes:
    words -- The words extracted from the PDF, or an error message string
             when the receipt could not be loaded or parsed.
    id    -- The receipt number as an int, or None on failure.
    date  -- The date reported by the parser, or today's date on failure.
    items -- The items reported by the parser, or an empty list on failure.
    """

    def __init__(self, strPDFFile) -> None:
        try:
            self.words = PDFReceipt._getWordsFromPDF(strPDFFile)
            storename = PDFReceipt._getStoreName(self.words)
            self.id, self.date, self.items = PDFReceipt._getInfosFromText(
                self.words, store=storename
            )
        except Exception:
            # Deliberate best effort: any failure while reading or parsing
            # yields a placeholder receipt instead of propagating.
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit through.
            self.words = "PDF konnte nicht geladen werden."
            self.date = date.today()
            self.id = None
            self.items = []

    @staticmethod
    def _getWordsFromPDF(file):
        """Return the words of every page of the PDF as one flat list.

        Each page is read through an OCR text page with language 'deu'.
        The entries are whatever ``page.get_text("words", ...)`` yields;
        the rest of this class relies on index 4 holding the word text.
        """
        words = []
        with fitz.open(file, filetype="pdf") as doc:
            for page in doc:
                ocr_textpage = page.get_textpage_ocr(language="deu")
                words.extend(
                    page.get_text("words", textpage=ocr_textpage, sort=True)
                )
        return words

    @staticmethod
    def _getStoreName(words: list[tuple]) -> str:
        """Return the first supported store name found among the words.

        The comparison is case-insensitive. Returns "unknown" when no word
        matches a supported store.
        """
        for word in words:
            candidate = word[4].lower()
            if candidate in ("edeka", "kaufland"):
                return candidate
        return "unknown"

    @staticmethod
    def _getInfosFromText(words: list[tuple], store: str = "edeka"):
        """Parse the words with the parser belonging to *store*.

        Returns a tuple ``(receipt number as int, date, items)`` built from
        the parser result's "bonid", "date" and "items" entries.

        Raises:
        ValueError -- if *store* has no parser, or if the receipt number
                      cannot be converted to an integer.
        """
        if store == "edeka":
            result = edekaparser(words)
        elif store == "kaufland":
            result = kauflandparser(words)
        else:
            # Previously this path fell through with `result` unbound and
            # failed with an opaque UnboundLocalError.
            raise ValueError(f"Unsupported store: {store!r}")

        items = result.get("items")
        receiptDate = result.get("date")
        strReceiptNumber = result.get("bonid")
        try:
            intReceiptNumber = int(strReceiptNumber)
        except (TypeError, ValueError, OverflowError) as err:
            # TypeError covers a missing "bonid" (None).
            raise ValueError("Receipt Number not an integer.") from err
        return (intReceiptNumber, receiptDate, items)


def getPDFReceiptFromFile(strPDFFile: str):
    """Build a PDFReceipt from the PDF file at *strPDFFile*.

    If the file does not exist, ``PDFReceipt(None)`` is returned instead.
    Other errors raised while opening the file propagate to the caller.
    """
    try:
        # Probe only: confirm the path can be opened, then hand the path
        # itself (not a text-mode handle) to PDFReceipt, whose documented
        # argument is a path string.
        with open(strPDFFile, "rb"):
            pass
    except FileNotFoundError:
        return PDFReceipt(None)
    return PDFReceipt(strPDFFile)