Refactored most imports to relative paths and made some experimental changes to angular.
54 lines
2.0 KiB
Python
54 lines
2.0 KiB
Python
import fitz
|
|
from datetime import datetime
|
|
from re import search
|
|
|
|
class PDFReceipt:
    """Class to use a PDF-Receipt as an object.

    Arguments:
    bPDFFile -- The PDF source. It is handed unchanged to ``fitz.open`` as
                its first argument, together with ``filetype="pdf"``.
    parser -- A keyword (compared case-insensitively) that tells how the
              receipt is formatted.
              Currently supported: 'edeka'

    After construction the instance exposes:
    text  -- the stripped text of all pages, concatenated in page order
    id    -- the receipt number as an int
    date  -- the receipt date as a ``datetime.date``
    items -- a list of dicts as built by ``_convertItemsTextToDict``
    """

    def __init__(self, bPDFFile, parser: str = "edeka") -> None:
        self.text = PDFReceipt._getTextFromPDF(bPDFFile)
        self.id, self.date, self.items = PDFReceipt._getInfosFromText(self.text, parser)

    @staticmethod
    def _getTextFromPDF(file) -> str:
        """Return the stripped text of every page of the PDF, in page order."""
        with fitz.open(file, filetype="pdf") as doc:
            # join() instead of repeated += keeps concatenation linear.
            return "".join(page.get_text() for page in doc).strip()

    @staticmethod
    def _getItemsTextFromText(text, start="", end=""):
        """Return the stripped text between the markers *start* and *end*.

        *end* is searched for only after the end of *start*, so an earlier
        occurrence of the end marker cannot produce an empty or wrong slice.

        Raises ValueError if either marker cannot be found.
        """
        startIndex = text.index(start) + len(start)
        endIndex = text.index(end, startIndex)
        return text[startIndex:endIndex].strip()

    @staticmethod
    def _convertItemsTextToDict(text) -> list:
        """Convert the item section of a receipt into a list of dicts.

        Two line layouts are recognised:
        * A line matching ``<digits> x`` starts a four-line record: the
          amount line, the price line, the item name and one further line
          that is skipped. The result has the keys "itemname", "price" and
          "amount" (the amount line without its last two characters).
        * Any other line starts a two-line record: the item name followed
          by the price line, whose last two characters are dropped. The
          result has the keys "itemname" and "price".

        An empty or whitespace-only *text* yields an empty list.
        Raises IndexError if the text ends in the middle of a record.
        """
        if not text.strip():
            return []

        lines = text.split("\n")
        resultsArr = []
        i = 0
        while i < len(lines):
            # Raw string: "\d" in a normal string literal is an invalid escape.
            if search(r"(\d+) x", lines[i]):
                resultsArr.append({
                    "itemname": lines[i + 2],
                    "price": lines[i + 1],
                    "amount": lines[i][:-2],
                })
                i += 4
            else:
                resultsArr.append({"itemname": lines[i], "price": lines[i + 1][:-2]})
                i += 2
        return resultsArr

    @staticmethod
    def _getInfosFromText(text: str, parser: str = "edeka") -> tuple:
        """Extract ``(receipt number, date, items)`` from the receipt text.

        For the 'edeka' parser the items are taken from the text between
        "EUR" and "----------". The last line of the text supplies the date
        (first space-separated token, format ``%d.%m.%y``) and the receipt
        number (last space-separated token).

        Raises ValueError if the parser is not supported, a marker is
        missing, the date does not match the format or the receipt number
        is not an integer.
        """
        if parser.lower() != "edeka":
            # Fail early with a clear message instead of running into
            # unassigned results further down.
            raise ValueError("Unsupported receipt parser: {!r}".format(parser))

        items = PDFReceipt._convertItemsTextToDict(
            PDFReceipt._getItemsTextFromText(text, start="EUR", end="----------")
        )

        lastLineTokens = text.split("\n")[-1].split(" ")
        date = datetime.strptime(lastLineTokens[0], "%d.%m.%y").date()

        try:
            intReceiptNumber = int(lastLineTokens[-1])
        except ValueError as err:
            raise ValueError("Receipt Number not an integer.") from err

        return (intReceiptNumber, date, items)
|
|
|
|
def getPDFReceiptFromFile(strPDFFile: str, parser: str = "edeka"):
    """Open the PDF at *strPDFFile* and return a PDFReceipt built from it.

    Arguments:
    strPDFFile -- The path to the PDF-File as a string.
    parser -- A keyword telling how the receipt is formatted; passed on
              to PDFReceipt unchanged.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened,
    plus whatever PDFReceipt raises while parsing.
    """
    # A PDF is binary data, so open it in "rb" rather than in text mode with
    # the locale's default encoding. The handle is closed on return;
    # PDFReceipt extracts the text inside its constructor.
    # NOTE(review): how fitz.open treats a file object passed as its first
    # argument depends on the PyMuPDF version -- confirm against the one in use.
    with open(strPDFFile, "rb") as doc:
        return PDFReceipt(doc, parser)