Parsers
The first step of working with Parsers is to implement all the parsers you need based on one of the three provided base classes: PDFBaseParser, EmailBaseParser and FileBaseParser.
The second step is to create an instance of ParserHandler, which automatically loads all the parsers based on the provided input arguments.
And finally, to parse our input data, we can call the parse() method, which will select the correct parser and parse the input data based on the provided Parsing Strategy.
See the parse_strategy parameter of ParserHandler.
- class aiviro.modules.parser.PDFBaseParser(supplier_name: str | None = None)
Base class to inherit from for creating PDF Parser.
- Example:
>>> import aiviro >>> from typing import Any >>> from dataclasses import dataclass >>> from aiviro.modules.pdf import PDFRobot >>> from aiviro.modules.parser import PDFBaseParser, ParserInvalidCheck >>> >>> @dataclass ... class DataOut: ... something: int ... >>> class SupplierParser(PDFBaseParser[DataOut]): ... def __init__(self): ... super().__init__("supplier name") ... ... def parse(self, r: PDFRobot) -> DataOut: ... if not r.get(aiviro.And( ... aiviro.Text("Invoice"), ... aiviro.Text("Supplier Name") ... )): ... raise ParserInvalidCheck ... ... # additional code ... return DataOut(42)
- class aiviro.modules.parser.EmailBaseParser(supplier_name: str | None = None)
Base class to inherit from for creating Email Parser.
- Example:
>>> import aiviro >>> from typing import Any >>> from aiviro.modules.email import IMAPMessage >>> from aiviro.modules.parser import EmailBaseParser, ParserInvalidCheck >>> >>> class SupplierParser(EmailBaseParser[str]): ... def __init__(self): ... super().__init__("supplier name") ... ... def parse(self, email: IMAPMessage) -> str: ... if email.subject.find("Supplier name") == -1: ... raise ParserInvalidCheck ... ... # additional code ... return email.subject
- class aiviro.modules.parser.FileBaseParser(supplier_name: str | None = None)
Base class to inherit from for creating File Parser.
- Example:
>>> import aiviro >>> import pathlib >>> from typing import Any >>> from aiviro.modules.parser import FileBaseParser, ParserInvalidCheck >>> >>> class SupplierParser(EmailBaseParser[str]): ... def __init__(self): ... super().__init__("supplier name") ... ... def parse(self, file_path: Union[str, pathlib.Path]) -> str: ... lib_path = pathlib.Path(file_path) # convert into Path object ... if lib_path.stem.find("Supplier name") == -1: ... raise ParserInvalidCheck ... ... # additional code ... return lib_path.name
- class aiviro.modules.parser.ParserHandler(parser_module: ~types.ModuleType | None, ignorable_python_files: ~typing.List[str] | None = None, ignorable_parser_objects: ~typing.List[~typing.Type[~aiviro.modules.parser.base.BaseLoadable]] | None = None, parse_strategy: ~aiviro.modules.parser.strategy.BaseParsingStrategy | None = None, loader: ~typing.Type[~aiviro.modules.parser.base.BaseLoader] = <class 'aiviro.modules.parser.loader.DefaultLoader'>)
Parser handler which loads and instantiates parsers from the provided Python module. It provides an interface to select the correct parser for processing the input data.
- Parameters:
parser_module – Python module with parsers to load
ignorable_python_files – Python source files excluded from module loading
ignorable_parser_objects – Parser objects excluded from module loading
parse_strategy – Strategy based on which the correct parser is selected, if None,
NaiveParsingStrategy
is used
- Example:
>>> from aiviro.modules.parser import ParserHandler >>> from aiviro.modules.pdf import create_pdf_robot >>> import src.parsers.pdf >>> handler = ParserHandler( ... src.parsers.pdf ... ) >>> r = create_pdf_robot("path/to/file.pdf") >>> r.set_as_stream_source() >>> result, parser = handler.parse(r) >>> parser.supplier_name "supplier name"
>>> from aiviro.modules.email import EmailClient >>> from src.common import CustomBasePDFParser >>> import src.parsers.email >>> handler = ParserHandler( ... src.parsers.email, ... ["ignore-file.py"], ... [CustomBasePDFParser] ... ) >>> client = EmailClient() >>> client.setup_imap_basic_auth("<IMAP_SERVER>", "<EMAIL_ADDRESS>", "<EMAIL_PASSWORD>") >>> for email in client.inbox.all(): ... result, parser = handler.parse(email)
>>> import src.parsers.files >>> handler = ParserHandler( ... src.parsers.files, ... ["common.py"] ... ) >>> result, parser = handler.parse("path/to/file")
- add_module(parser_module: ModuleType, ignorable_python_files: List[str] | None = None, ignorable_parser_objects: List[Type[BaseLoadable]] | None = None) None
Loads parsers from python module.
- Parameters:
parser_module – Python module with parsers to load
ignorable_python_files – Python source files excluded from module loading
ignorable_parser_objects – Parser objects excluded from module loading
- add_parser(parser: BaseLoadable | Type[BaseLoadable]) None
Add a parser.
- Parameters:
parser – Instance of the parser or type of parser to instantiate
- class aiviro.modules.parser.BaseParsingStrategy
- abstract parse(*args: Any, parsers: List[BaseParser]) Tuple[Any, BaseParser]
Passes the input data through the input list of parsers and selects the valid one.
- Parameters:
*args – Arguments passed to ‘parse’ method of loaded parsers
parsers – List of parsers used for parsing
- Returns:
Tuple of result-data obtained from parser and selected parser
- class aiviro.modules.parser.NaiveParsingStrategy
Naive and default parsing strategy for
ParserHandler
. The strategy goes through all the parsers and selects the first one which doesn't raise the ParserInvalidCheck
exception.
- parse(*args: Any, parsers: List[BaseParser]) Tuple[Any, BaseParser]
Passes the input data through the input list of parsers and selects the valid one.
- Parameters:
*args – Arguments passed to ‘parse’ method of loaded parsers
parsers – List of parsers used for parsing
- Raises:
ParserProcessingError – When parser encountered some error
ParserNotFound – If no parser was selected
- Returns:
Tuple of result-data obtained from parser and selected parser
- class aiviro.modules.parser.AutoSelectParsingStrategy(strategy: BaseParsingStrategy | None = None)
The strategy automatically selects the appropriate type of parsers based on the type of the input argument. It checks the type of the first argument
args[0]
:
- if the type is PDFRobot, it selects PDFBaseParser parsers
- if the type is IMAPMessage, it selects EmailBaseParser parsers
- if the type is str or pathlib.Path, it selects FileBaseParser parsers
- if none of the conditions is met, it selects all provided parsers
The parsing logic is then handled by
NaiveParsingStrategy
.
- Parameters:
strategy – Parser that handles data-parsing logic, if None,
NaiveParsingStrategy
is used
- parse(*args: Any, parsers: List[BaseParser]) Tuple[Any, BaseParser]
Selects the appropriate type of parsers based on the type of the input argument, and then passes them into
NaiveParsingStrategy
.
- Parameters:
*args – Arguments used for selecting correct parsers
parsers – List of parsers to select from
- Returns:
Tuple of result-data obtained from parser and selected parser
- exception aiviro.modules.parser.ParserInvalidCheck
- exception aiviro.modules.parser.ParserNotFound
- exception aiviro.modules.parser.ParserProcessingError(*args: Any, parser: BaseParser | None = None, parsing_data: Any = None)
Examples
Here is an example of how to implement several PDF parsers, where each parser is responsible for parsing a specific type of PDF file. In this case, each supplier sends us a PDF file with a different structure, and we need to parse the data from these files.
from dataclasses import dataclass
from decimal import Decimal
import aiviro
from aiviro.modules.parser import ParserHandler, ParserInvalidCheck, PDFBaseParser
from aiviro.modules.pdf import PDFRobot, create_pdf_robot
@dataclass(frozen=True)
class Document:
    """Immutable result produced by the supplier parsers in this example."""

    # Supplier identifier as read from the PDF.
    supplier_id: str
    # Document identifier as read from the PDF.
    document_id: str
    # Total amount, kept as Decimal to avoid float rounding.
    total_amount: Decimal
class Supplier1Parser(PDFBaseParser[Document]):
    """Parser for PDF files registered under the supplier name "supplier1"."""

    def __init__(self):
        super().__init__("supplier1")

    def parse(self, r: "PDFRobot") -> Document:
        """Extract a ``Document`` from the PDF loaded in the robot.

        :param r: PDF robot holding the file to parse
        :raises ParserInvalidCheck: If the text "Supplier1" is not found in the PDF
        :return: Document with supplier ID, document ID and total amount
        """
        # Validity check: signal that this PDF does not belong to this parser
        # so that the parsing strategy can move on to another one.
        if not r.get(aiviro.Text("Supplier1")):
            raise ParserInvalidCheck

        # NOTE(review): the empty Text presumably matches any text to the right
        # of the "Supplier ID" label - confirm against Aiviro documentation.
        supplier_id = r.see(
            aiviro.OnTheRight(aiviro.Text(""), aiviro.Text("Supplier ID"))
        )
        document_id = r.see(
            aiviro.OnTheRight(aiviro.RegexText(r"(\d+)"), aiviro.Text("Document ID"))
        )
        total_amount = r.see(
            aiviro.OnTheRight(
                aiviro.RegexText(r"(\d+\.\d+)"), aiviro.Text("Total Amount")
            )
        )

        return Document(
            supplier_id=supplier_id.text,
            document_id=document_id.regex_match.group(1),
            total_amount=Decimal(total_amount.regex_match.group(1)),
        )
class Supplier2Parser(PDFBaseParser[Document]):
    """Parser for PDF files registered under the supplier name "supplier2"."""

    def __init__(self):
        super().__init__("supplier2")

    def parse(self, r: "PDFRobot") -> Document:
        """Extract a ``Document`` from the PDF loaded in the robot.

        :param r: PDF robot holding the file to parse
        :raises ParserInvalidCheck: If the text "Supplier2" is not found in the PDF
        :return: Document with supplier ID, document ID and total amount
        """
        # Validity check: signal that this PDF does not belong to this parser
        # so that the parsing strategy can move on to another one.
        if not r.get(aiviro.Text("Supplier2")):
            raise ParserInvalidCheck

        # RegexText with a raw pattern is required here: the result is read
        # through ``regex_match`` below, same as for the other fields.
        supplier_id = r.see(
            aiviro.OnTheRight(aiviro.RegexText(r"(\d+)"), aiviro.Text("Identification"))
        )
        document_id = r.see(
            aiviro.OnTheRight(aiviro.RegexText(r"(\d+)"), aiviro.Text("Note ID"))
        )
        total_amount = r.see(
            aiviro.OnTheRight(
                aiviro.RegexText(r"(\d+\,\d+)"), aiviro.Text("Total Payment")
            )
        )

        # The amount is matched with a decimal comma, which Decimal() rejects;
        # normalize it to a decimal point before converting.
        amount_text = total_amount.regex_match.group(1).replace(",", ".")

        return Document(
            supplier_id=supplier_id.regex_match.group(1),
            document_id=document_id.regex_match.group(1),
            total_amount=Decimal(amount_text),
        )
if __name__ == "__main__":
    # No parser module is passed in; the parsers are registered one by one.
    p_handler = ParserHandler(None)
    p_handler.add_parser(Supplier1Parser())
    p_handler.add_parser(Supplier2Parser())

    r = create_pdf_robot("path/to/file.pdf")

    # parse() returns the parsed data together with the parser that produced it.
    result, parser = p_handler.parse(r)
    print(f"Selected Parser: {parser.supplier_name}")
    print(f"Result: {result}")