Parsers

The first step of working with Parsers is to implement all the parsers you need based on one of three provided base classes: PDFBaseParser, EmailBaseParser and FileBaseParser.

The second step is to create an instance of ParserHandler, which automatically loads all the parsers based on provided input arguments.

Finally, to parse the input data, call the parse() method, which selects the correct parser and parses the input data according to the configured parsing strategy. See the parse_strategy parameter of ParserHandler.

class aiviro.modules.parser.PDFBaseParser(supplier_name: str | None = None)

Base class to inherit from for creating PDF Parser.

Example:

>>> import aiviro
>>> from typing import Any
>>> from dataclasses import dataclass
>>> from aiviro.modules.pdf import PDFRobot
>>> from aiviro.modules.parser import PDFBaseParser, ParserInvalidCheck
>>>
>>> @dataclass
... class DataOut:
...     something: int
...
>>> class SupplierParser(PDFBaseParser[DataOut]):
...     def __init__(self):
...         super().__init__("supplier name")
...
...     def parse(self, r: PDFRobot) -> DataOut:
...         if not r.get(aiviro.And(
...             aiviro.Text("Invoice"),
...             aiviro.Text("Supplier Name")
...         )):
...             raise ParserInvalidCheck
...
...         # additional code
...         return DataOut(42)
classmethod enabled() bool

Check if the parser is enabled and can be used. By default, the parser is enabled.

classmethod priority() int

Priority for selecting the parser. The higher the value, the higher the priority. By default, the priority is 0.

class aiviro.modules.parser.EmailBaseParser(supplier_name: str | None = None)

Base class to inherit from for creating Email Parser.

Example:

>>> import aiviro
>>> from typing import Any
>>> from aiviro.modules.email import IMAPMessage
>>> from aiviro.modules.parser import EmailBaseParser, ParserInvalidCheck
>>>
>>> class SupplierParser(EmailBaseParser[str]):
...     def __init__(self):
...         super().__init__("supplier name")
...
...     def parse(self, email: IMAPMessage) -> str:
...         if email.subject.find("Supplier name") == -1:
...             raise ParserInvalidCheck
...
...         # additional code
...         return email.subject
classmethod enabled() bool

Check if the parser is enabled and can be used. By default, the parser is enabled.

classmethod priority() int

Priority for selecting the parser. The higher the value, the higher the priority. By default, the priority is 0.

class aiviro.modules.parser.FileBaseParser(supplier_name: str | None = None)

Base class to inherit from for creating File Parser.

Example:

>>> import aiviro
>>> import pathlib
>>> from typing import Any, Union
>>> from aiviro.modules.parser import FileBaseParser, ParserInvalidCheck
>>>
>>> class SupplierParser(FileBaseParser[str]):
...     def __init__(self):
...         super().__init__("supplier name")
...
...     def parse(self, file_path: Union[str, pathlib.Path]) -> str:
...         lib_path = pathlib.Path(file_path)  # convert into Path object
...         if lib_path.stem.find("Supplier name") == -1:
...             raise ParserInvalidCheck
...
...         # additional code
...         return lib_path.name
classmethod enabled() bool

Check if the parser is enabled and can be used. By default, the parser is enabled.

classmethod priority() int

Priority for selecting the parser. The higher the value, the higher the priority. By default, the priority is 0.

class aiviro.modules.parser.ParserHandler(parser_module: ~types.ModuleType | None, ignorable_python_files: ~typing.List[str] | None = None, ignorable_parser_objects: ~typing.List[~typing.Type[~aiviro.modules.parser.base.BaseLoadable]] | None = None, parse_strategy: ~aiviro.modules.parser.strategy.BaseParsingStrategy | None = None, loader: ~typing.Type[~aiviro.modules.parser.base.BaseLoader] = <class 'aiviro.modules.parser.loader.DefaultLoader'>)

Parser handler which loads and instantiates parsers from the provided Python module. It provides an interface for selecting the correct parser to process the input data.

Parameters:
  • parser_module – Python module with parsers to load

  • ignorable_python_files – Python source files excluded from module loading

  • ignorable_parser_objects – Parser objects excluded from module loading

  • parse_strategy – Strategy based on which the correct parser is selected, if None, NaiveParsingStrategy is used

Example:

>>> from aiviro.modules.parser import ParserHandler
>>> from aiviro.modules.pdf import create_pdf_robot
>>> import src.parsers.pdf
>>> handler = ParserHandler(
...     src.parsers.pdf
... )
>>> r = create_pdf_robot("path/to/file.pdf")
>>> r.set_as_stream_source()
>>> result, parser = handler.parse(r)
>>> parser.supplier_name
"supplier name"
>>> from aiviro.modules.email import EmailClient
>>> from src.common import CustomBasePDFParser
>>> import src.parsers.email
>>> handler = ParserHandler(
...     src.parsers.email,
...     ["ignore-file.py"],
...     [CustomBasePDFParser]
... )
>>> client = EmailClient()
>>> client.setup_imap_basic_auth("<IMAP_SERVER>", "<EMAIL_ADDRESS>", "<EMAIL_PASSWORD>")
>>> for email in client.inbox.all():
...     result, parser = handler.parse(email)
>>> import src.parsers.files
>>> handler = ParserHandler(
...     src.parsers.files,
...     ["common.py"]
... )
>>> result, parser = handler.parse("path/to/file")
add_module(parser_module: ModuleType, ignorable_python_files: List[str] | None = None, ignorable_parser_objects: List[Type[BaseLoadable]] | None = None) None

Loads parsers from python module.

Parameters:
  • parser_module – Python module with parsers to load

  • ignorable_python_files – Python source files excluded from module loading

  • ignorable_parser_objects – Parser objects excluded from module loading

add_parser(parser: BaseLoadable | Type[BaseLoadable]) None

Add a parser.

Parameters:

parser – Instance of the parser or type of parser to instantiate

property parsers: List[BaseParser[T]]

All parsers loaded by the handler.

Returns:

List of parsers, sorted by their priority

parse(*args: Any) Tuple[T, BaseParser[T]]

Passes the input data into provided parsing strategy.

Parameters:

*args – Arguments passed to parsing strategy object

Returns:

Tuple of result data obtained from parser and selected parser

class aiviro.modules.parser.BaseParsingStrategy
abstract parse(*args: Any, parsers: List[BaseParser]) Tuple[Any, BaseParser]

Passes the input data through the given list of parsers and selects the valid one.

Parameters:
  • *args – Arguments passed to ‘parse’ method of loaded parsers

  • parsers – List of parsers used for parsing

Returns:

Tuple of result-data obtained from parser and selected parser

class aiviro.modules.parser.NaiveParsingStrategy

Naive and default parsing strategy for ParserHandler. The strategy goes through all the parsers and selects the first one which doesn't raise a ParserInvalidCheck exception.

parse(*args: Any, parsers: List[BaseParser]) Tuple[Any, BaseParser]

Passes the input data through the given list of parsers and selects the valid one.

Parameters:
  • *args – Arguments passed to ‘parse’ method of loaded parsers

  • parsers – List of parsers used for parsing

Raises:
Returns:

Tuple of result-data obtained from parser and selected parser

class aiviro.modules.parser.AutoSelectParsingStrategy(strategy: BaseParsingStrategy | None = None)

The strategy automatically selects the appropriate type of parsers based on the type of the input argument. It checks the type of the first argument, args[0].

The parsing logic is then handled by the strategy passed in the strategy parameter (NaiveParsingStrategy by default).

Parameters:

strategy – Parser that handles data-parsing logic, if None, NaiveParsingStrategy is used

parse(*args: Any, parsers: List[BaseParser]) Tuple[Any, BaseParser]

Selects the appropriate type of parsers based on the type of the input argument, and then passes them to the underlying strategy (NaiveParsingStrategy by default).

Parameters:
  • *args – Arguments used for selecting correct parsers

  • parsers – List of parsers to select from

Returns:

Tuple of result-data obtained from parser and selected parser

exception aiviro.modules.parser.ParserInvalidCheck
exception aiviro.modules.parser.ParserNotFound
exception aiviro.modules.parser.ParserProcessingError(*args: Any, parser: BaseParser | None = None, parsing_data: Any = None)

Examples

Here is an example of how to implement several PDF parsers, where each parser is responsible for parsing a specific type of PDF file. In this case, each supplier sends us a PDF file with a different structure, and we need to parse the data from these files.

from dataclasses import dataclass
from decimal import Decimal

import aiviro
from aiviro.modules.parser import ParserHandler, ParserInvalidCheck, PDFBaseParser
from aiviro.modules.pdf import PDFRobot, create_pdf_robot


@dataclass(frozen=True)
class Document:
    """Immutable record returned by the supplier parsers in this example."""

    supplier_id: str  # supplier identifier read from the PDF
    document_id: str  # document identifier read from the PDF
    total_amount: Decimal  # total amount read from the PDF, as an exact decimal


class Supplier1Parser(PDFBaseParser[Document]):
    """PDF parser registered under the supplier name "supplier1"."""

    def __init__(self):
        super().__init__("supplier1")

    def parse(self, r: "PDFRobot") -> Document:
        """Build a ``Document`` from a "supplier1" PDF.

        Args:
            r: PDF robot used to look up text in the document.

        Returns:
            Document filled with the supplier ID, document ID and total amount.

        Raises:
            ParserInvalidCheck: If ``r.get`` yields a falsy result for the
                text "Supplier1", i.e. this parser does not apply.
        """
        # Validity check: this parser only handles PDFs mentioning "Supplier1".
        supplier_marker = r.get(aiviro.Text("Supplier1"))
        if not supplier_marker:
            raise ParserInvalidCheck

        # Each value is searched for to the right of its label.
        supplier_query = aiviro.OnTheRight(
            aiviro.Text(""), aiviro.Text("Supplier ID")
        )
        supplier_box = r.see(supplier_query)

        document_query = aiviro.OnTheRight(
            aiviro.RegexText(r"(\d+)"), aiviro.Text("Document ID")
        )
        document_box = r.see(document_query)

        amount_query = aiviro.OnTheRight(
            aiviro.RegexText(r"(\d+\.\d+)"), aiviro.Text("Total Amount")
        )
        amount_box = r.see(amount_query)

        # The supplier ID is taken as plain text; the other two values come
        # from the first capture group of their regex match.
        supplier_value = supplier_box.text
        document_value = document_box.regex_match.group(1)
        amount_value = Decimal(amount_box.regex_match.group(1))

        return Document(
            supplier_id=supplier_value,
            document_id=document_value,
            total_amount=amount_value,
        )


class Supplier2Parser(PDFBaseParser[Document]):
    """PDF parser registered under the supplier name "supplier2"."""

    def __init__(self):
        super().__init__("supplier2")

    def parse(self, r: "PDFRobot") -> Document:
        """Build a ``Document`` from a "supplier2" PDF.

        Args:
            r: PDF robot used to look up text in the document.

        Returns:
            Document filled with the supplier ID, document ID and total amount.

        Raises:
            ParserInvalidCheck: If ``r.get`` yields a falsy result for the
                text "Supplier2", i.e. this parser does not apply.
        """
        if not r.get(aiviro.Text("Supplier2")):
            raise ParserInvalidCheck

        # The result's ``regex_match`` is read below, so the value must be
        # searched with RegexText and a raw-string pattern (a plain
        # ``Text("(\d+)")`` also contains an invalid escape sequence).
        supplier_id = r.see(
            aiviro.OnTheRight(aiviro.RegexText(r"(\d+)"), aiviro.Text("Identification"))
        )
        document_id = r.see(
            aiviro.OnTheRight(aiviro.RegexText(r"(\d+)"), aiviro.Text("Note ID"))
        )
        total_amount = r.see(
            aiviro.OnTheRight(
                aiviro.RegexText(r"(\d+\,\d+)"), aiviro.Text("Total Payment")
            )
        )

        # The pattern captures a decimal comma (e.g. "123,45"), which
        # Decimal() rejects with InvalidOperation; normalise it to a dot first.
        amount_text = total_amount.regex_match.group(1).replace(",", ".")

        return Document(
            supplier_id=supplier_id.regex_match.group(1),
            document_id=document_id.regex_match.group(1),
            total_amount=Decimal(amount_text),
        )


if __name__ == "__main__":
    # No parser module is passed (None); the parsers are registered by hand.
    handler = ParserHandler(None)
    for parser_cls in (Supplier1Parser, Supplier2Parser):
        handler.add_parser(parser_cls())

    # Parse the PDF with whichever registered parser the handler selects.
    pdf_robot = create_pdf_robot("path/to/file.pdf")
    result, parser = handler.parse(pdf_robot)

    print(f"Selected Parser: {parser.supplier_name}")
    print(f"Result: {result}")