# Source code for remarking.highlight_extractor.highlight_extractor

from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from typing import List

from remarking import models


@dataclass
class ExtractorData:
    """ Represents an extractor mapping entry.

        This is used by remarking to generate extractor choices for the end user.

        :param extractor_name: The name of the extractor on the command line.
        :param instance: An instance of the extractor.
        :param description: A description for the extractor. This is shown when running
                            ``remarking list extractors``.
    """
    extractor_name: str
    # Quoted forward reference: HighlightExtractor is declared later in this module.
    instance: 'HighlightExtractor'
    description: str


class HighlightExtractor(metaclass=ABCMeta):  # pylint: disable=too-few-public-methods
    """ Base class for highlight extractors.

        Extractors are run after documents are downloaded. remarking calls
        :meth:`HighlightExtractor.get_highlights` for each document downloaded.

        For example, :class:`RemarkableHighlightExtractor` extracts the
        highlights from the built-in reMarkable highlighting functionality.

        Both methods are abstract, so a concrete extractor must implement each
        of them before it can be instantiated.
    """

    # ``classmethod`` must be the outermost decorator when combined with
    # ``abstractmethod`` so the abstract flag is applied to the function first.
    @classmethod
    @abstractmethod
    def get_extractor_instance_data(cls) -> List[ExtractorData]:
        """ Return a list of :class:`ExtractorData` instances representing
            different run options for the extractor.

        :return: One :class:`ExtractorData` entry per run option.
        """

    @abstractmethod
    def get_highlights(self, working_path: str, document: models.Document) -> List[models.Highlight]:
        """ Retrieve all highlights for document.

        :param working_path: The path on the operating system where all documents were downloaded. Documents
            are downloaded from the cloud and unzipped into this directory. For more information on the layout
            check out `<https://remarkablewiki.com/tech/filesystem#user_data_directory_structure>`_.

        :param document: The document to extract highlights for.

        :return: A list of highlights for the document.
        """


def clean_highlight_text(text: str) -> str:
    """ Return ``text`` with typographic quotes swapped for plain ASCII quotes.

        Curly double quotes become ``"`` and curly single quotes become ``'``.
        Every other character is passed through unchanged.

        :param text: The text to clean.
        :return: The cleaned text.
    """
    # Each source character maps to exactly one replacement character, so a
    # single translation pass is equivalent to replacing them one at a time.
    quote_table = str.maketrans({
        "“": "\"",
        "”": "\"",
        "‘": "'",
        "’": "'",
    })
    return text.translate(quote_table)