Source code for remarking.highlight_extractor.remarkable_highlight_extractor

import functools
import json
import logging
import os
import typing as T
from dataclasses import dataclass
from typing import Dict, List

from remarking import models
from remarking.highlight_extractor import highlight_extractor


@dataclass
class RawHighlight:
    """ Represent one raw highlight entry read from a reMarkable highlights file.

    The three fields mirror the ``start``, ``length`` and ``text`` keys of a
    single highlight entry in the JSON data.
    """
    # Value of the entry's ``start`` key; highlights of a page are sorted by it.
    start: int
    # Value of the entry's ``length`` key; ``start + length`` is treated as the end offset.
    length: int
    # Value of the entry's ``text`` key.
    text: str


def get_page_number_mapping(working_path: str, doc_id: str) -> T.Optional[Dict[str, int]]:
    """ Return a mapping of page id to page number or None if .content metadata file could not be found.

    The mapping is built from the ``pages`` list stored in ``<doc_id>.content``
    inside ``working_path``: every page id maps to its zero-based position in
    that list.

    :param working_path: Directory expected to contain ``<doc_id>.content``.
    :param doc_id: Id of the document whose content metadata is read.
    :returns: ``{page_id: index}``, or ``None`` when the ``.content`` file does not exist.
    :raises KeyError: If the metadata has no ``pages`` entry.
    """
    contents_path = os.path.join(working_path, f"{doc_id}.content")
    if not os.path.exists(contents_path):
        logging.info(f"Could not find a contents file at {contents_path}")
        return None
    # JSON text is UTF-8; decode it explicitly rather than with the platform's
    # locale encoding, which may mis-decode or reject non-ASCII metadata.
    with open(contents_path, "r", encoding="utf-8") as contents_file:
        page_ids = json.load(contents_file)["pages"]
    return {page_id: ind for ind, page_id in enumerate(page_ids)}


def create_raw_highlight(raw_highlight_data: Dict[str, T.Any]) -> RawHighlight:
    """ Build a RawHighlight from the dict describing a single highlight entry.

    The ``start``, ``length`` and ``text`` keys are required; a missing key
    raises KeyError.
    """
    start = raw_highlight_data['start']
    length = raw_highlight_data['length']
    text = raw_highlight_data['text']
    return RawHighlight(start=start, length=length, text=text)


def get_raw_highlights_by_page(working_path: str,
                               doc_id: str) -> T.Optional[Dict[str, List[RawHighlight]]]:
    """ Return raw highlights by page id for a given working_path that contains the passed document id.

    Every ``<page_id>.json`` file inside ``<working_path>/<doc_id>.highlights`` is
    read. The per-layer lists under its ``highlights`` key are flattened, each
    entry's text is passed through ``highlight_extractor.clean_highlight_text``,
    and the resulting RawHighlight objects are sorted by ``start``.

    :param working_path: Directory expected to contain ``<doc_id>.highlights``.
    :param doc_id: Id of the document whose highlights are read.
    :returns: ``{page_id: [RawHighlight, ...]}``, or ``None`` when the highlights
        folder does not exist.
    :raises KeyError: If a page file has no ``highlights`` key or an entry lacks
        ``start``, ``length`` or ``text``.
    """
    raw_highlights: T.Dict[str, List[RawHighlight]] = {}

    highlights_path = os.path.join(working_path, f"{doc_id}.highlights")

    if not os.path.exists(highlights_path):
        logging.info(f"Could not find a highlights folder at {highlights_path}")
        return None

    json_suffix = ".json"
    for highlight_file in os.listdir(highlights_path):
        # Only ``<page_id>.json`` files are page highlight files; anything else in
        # the folder (e.g. OS metadata files) must not be parsed as JSON.
        if not highlight_file.endswith(json_suffix):
            continue
        # Strip the extension from the end only, never from inside the name.
        page_id = highlight_file[:-len(json_suffix)]
        page_highlight_path = os.path.join(highlights_path, highlight_file)
        # JSON text is UTF-8; do not rely on the platform's locale encoding.
        with open(page_highlight_path, "r", encoding="utf-8") as highlights_file:
            highlights_by_layer: List[List[Dict[str, T.Any]]] = json.load(highlights_file)['highlights']

        # Flatten the layers in one linear pass, cleaning the text of each entry
        # before it is turned into a RawHighlight.
        page_highlights = [
            create_raw_highlight(
                {**raw_json, 'text': highlight_extractor.clean_highlight_text(raw_json['text'])}
            )
            for layer in highlights_by_layer
            for raw_json in layer
        ]
        # Stable sort: entries with the same start keep their file order.
        page_highlights.sort(key=lambda x: x.start)
        raw_highlights[page_id] = page_highlights
    return raw_highlights


class RemarkableHighlightExtractor(highlight_extractor.HighlightExtractor):
    """ Extracts highlights from the ``highlights`` folder of reMarkable documents. """
    # NOTE: the class docstring above is read at runtime (``cls.__doc__``) and used
    # as the extractor description, so changing it changes program output.

    @classmethod
    def get_extractor_instance_data(cls) -> List[highlight_extractor.ExtractorData]:
        """ Return the single ExtractorData entry describing this extractor.

        The entry is named ``"remarkable"``, carries a fresh instance of the class
        and uses the class docstring as its description.
        """
        return [
            highlight_extractor.ExtractorData(
                extractor_name="remarkable",
                instance=cls(),
                description=cls.__doc__
            )
        ]

    def get_highlights(self, working_path: str, document: models.Document) -> List[models.Highlight]:
        """ Return the highlights found for ``document`` inside ``working_path``.

        :param working_path: Directory holding the document's ``.content`` file and
            ``.highlights`` folder.
        :param document: Document whose ``id`` is used to locate those files.
        :returns: The extracted highlights; an empty list when either the page
            mapping or the raw highlights could not be loaded.
        """
        logging.info("Getting highlights from remarkable")

        page_id_to_page_num = get_page_number_mapping(working_path, document.id)
        if page_id_to_page_num is None:
            logging.info(f"Failed to get page_id_to_page_num mapping for {document.id}")
            return []

        raw_highlights_by_page = get_raw_highlights_by_page(working_path, document.id)
        if raw_highlights_by_page is None:
            logging.info(f"Failed to get raw highlights for {document.id}")
            return []

        return self._from_raw_highlights(
            document.id,
            page_id_to_page_num,
            raw_highlights_by_page
        )

    def _from_raw_highlights(self,
                             doc_id: str,
                             page_id_to_page_num: Dict[str, int],
                             raw_highlights_by_page: Dict[str, List[RawHighlight]]) -> List[models.Highlight]:
        """ Create Highlights from the RawHighlights of every page.

        Consecutive raw highlights of a page are merged into one Highlight while
        the gap between the end of one and the start of the next is at most 3;
        a larger gap starts a new Highlight. Each page's list is expected to be
        sorted by ``start``.

        :param doc_id: Id of the document the highlights belong to.
        :param page_id_to_page_num: Page id to page number mapping.
        :param raw_highlights_by_page: Raw highlights keyed by page id.
        :returns: One Highlight per merged run of raw highlights.
        :raises KeyError: If a page with highlights is missing from ``page_id_to_page_num``.
        """
        # TODO: This can join across pages most likely by checking lengths...
        # Need a way to know the length of a page in characters
        highlight_recs: T.List[models.Highlight] = []
        extractor_name = self.__class__.__name__

        def commit(text: str, page: int) -> None:
            # Store the text collected so far as one finished Highlight.
            highlight_recs.append(
                models.Highlight.create_highlight(
                    doc_id,
                    highlight_extractor.clean_highlight_text(text),
                    page,
                    extractor_name
                )
            )

        for page_id, page_raw_highlights in raw_highlights_by_page.items():
            if not page_raw_highlights:
                # Nothing highlighted on this page: do not emit an empty Highlight.
                continue

            highlight_page = page_id_to_page_num[page_id]
            # Seed with the first highlight, so a page with exactly one highlight
            # keeps its text instead of producing an empty Highlight.
            highlight_text = page_raw_highlights[0].text

            for prev_highlight, cur_highlight in zip(page_raw_highlights, page_raw_highlights[1:]):
                last_ending_index = prev_highlight.start + prev_highlight.length
                # Gap between the end of the previous highlight and the start of
                # the current one; negative when they overlap.
                diff = cur_highlight.start - last_ending_index

                if diff > 3:
                    # if our highlights distance is more than 3 character lets commit what we have
                    # as a highlight. Distance of 3 allows us to join across lines.
                    commit(highlight_text, highlight_page)
                    highlight_text = cur_highlight.text
                elif diff < 0:
                    # highlights overlap: drop the overlapping tail of the collected
                    # text, insert the current text, and - when the overlap is longer
                    # than the current text - re-attach what followed it.
                    # NOTE(review): assumes len(text) matches the ``length`` field - confirm.
                    highlight_text = (
                        highlight_text[:diff] + cur_highlight.text +
                        (highlight_text[len(highlight_text) + diff + len(cur_highlight.text):]
                         if abs(diff) > len(cur_highlight.text) else "")
                    )
                else:
                    # 0 <= diff <= 3: adjacent or nearly adjacent highlights are
                    # joined with a single space.
                    highlight_text += " " + cur_highlight.text

            commit(highlight_text, highlight_page)

        return highlight_recs