# Source code for remarking.highlight_extractor.remarkable_highlight_extractor
import functools
import json
import logging
import os
import typing as T
from dataclasses import dataclass
from typing import Dict, List
from remarking import models
from remarking.highlight_extractor import highlight_extractor
@dataclass
class RawHighlight:
    """A single raw highlight entry taken from a reMarkable highlights file.

    The three fields mirror the ``start``, ``length`` and ``text`` keys of
    one highlight JSON object.
    """

    # Offset at which the highlight begins (presumably counted in characters
    # of the page text -- confirm against the reMarkable file format).
    start: int
    # Extent of the highlight; ``start + length`` is used as its end offset.
    length: int
    # The highlighted text.
    text: str
def get_page_number_mapping(working_path: str, doc_id: str) -> T.Optional[Dict[str, int]]:
    """Return a mapping of page id to zero-based page index for a document.

    The mapping is read from the ``pages`` list of the ``<doc_id>.content``
    JSON file inside ``working_path``; every page id maps to its position in
    that list.

    :param working_path: Directory expected to contain the ``.content`` file.
    :param doc_id: Document id used to build the ``.content`` file name.
    :returns: ``{page_id: index}``, or ``None`` if the ``.content`` file does
        not exist.
    :raises KeyError: If the parsed file has no ``pages`` entry.
    """
    contents_path = os.path.join(working_path, f"{doc_id}.content")
    if not os.path.exists(contents_path):
        logging.info(f"Could not find a contents file at {contents_path}")
        return None
    # JSON is UTF-8 by specification; decode explicitly instead of relying on
    # the platform's locale encoding (which is not UTF-8 everywhere).
    with open(contents_path, "r", encoding="utf-8") as contents_file:
        page_ids = json.load(contents_file)["pages"]
    return {page_id: ind for ind, page_id in enumerate(page_ids)}
def create_raw_highlight(raw_highlight_data: Dict[str, T.Any]) -> RawHighlight:
    """Build a :class:`RawHighlight` from one parsed highlight JSON object.

    :param raw_highlight_data: Mapping that must provide the ``start``,
        ``length`` and ``text`` keys.
    :returns: A ``RawHighlight`` carrying those three values.
    :raises KeyError: If any of the three keys is missing.
    """
    start = raw_highlight_data['start']
    length = raw_highlight_data['length']
    text = raw_highlight_data['text']
    return RawHighlight(start=start, length=length, text=text)
def get_raw_highlights_by_page(working_path: str,
                               doc_id: str) -> T.Optional[Dict[str, List[RawHighlight]]]:
    """Return the raw highlights of a document, grouped by page id.

    Every file inside the ``<doc_id>.highlights`` folder of ``working_path``
    is parsed as JSON. Its ``highlights`` entry is read as a list of layers,
    each layer being a list of highlight objects. The layers of a page are
    flattened, every highlight text is passed through
    ``highlight_extractor.clean_highlight_text`` and the page's highlights
    are sorted by ``start``.

    :param working_path: Directory expected to contain the highlights folder.
    :param doc_id: Document id used to build the folder name.
    :returns: ``{page_id: [RawHighlight, ...]}`` where ``page_id`` is the file
        name with ``.json`` removed, or ``None`` if the highlights folder does
        not exist.
    """
    highlights_path = os.path.join(working_path, f"{doc_id}.highlights")
    if not os.path.exists(highlights_path):
        logging.info(f"Could not find a highlights folder at {highlights_path}")
        return None

    raw_highlights: T.Dict[str, List[RawHighlight]] = {}
    for highlight_file in os.listdir(highlights_path):
        page_id = highlight_file.replace(".json", "")
        page_highlight_path = os.path.join(highlights_path, highlight_file)
        # Highlighted text may contain non-ASCII characters. JSON is UTF-8 by
        # specification, so decode explicitly rather than with the locale default.
        with open(page_highlight_path, "r", encoding="utf-8") as highlights_file:
            highlights_by_layer: List[List[Dict[str, T.Any]]] = json.load(highlights_file)['highlights']

        # Flatten the layers in a single pass (no repeated list concatenation).
        page_highlights: List[RawHighlight] = []
        for layer in highlights_by_layer:
            for raw_json in layer:
                raw_json['text'] = highlight_extractor.clean_highlight_text(raw_json['text'])
                page_highlights.append(create_raw_highlight(raw_json))

        # Stable sort, so highlights sharing a start keep their file order.
        page_highlights.sort(key=lambda raw: raw.start)
        raw_highlights[page_id] = page_highlights
    return raw_highlights
class RemarkableHighlightExtractor(highlight_extractor.HighlightExtractor):
    # NOTE: the docstring below is read at runtime (``cls.__doc__`` is used as
    # the extractor description), so changing it changes program output.
    """ Extracts highlights from the ``highlights`` folder of reMarkable documents. """

    @classmethod
    def get_extractor_instance_data(cls) -> List[highlight_extractor.ExtractorData]:
        """ Return the single ``ExtractorData`` entry describing this extractor.

        The entry is named ``"remarkable"``, carries a fresh instance of this
        class and uses the class docstring as its description.
        """
        return [
            highlight_extractor.ExtractorData(
                extractor_name="remarkable",
                instance=cls(),
                description=cls.__doc__
            )
        ]

    def get_highlights(self, working_path: str, document: models.Document) -> List[models.Highlight]:
        """ Extract the highlights of ``document`` from the files in ``working_path``.

        :param working_path: Directory holding the document's ``.content`` file
            and ``.highlights`` folder.
        :param document: Document whose ``id`` is used to locate those files.
        :returns: The extracted highlights, or an empty list when either the
            page mapping or the raw highlights could not be found.
        """
        logging.info("Getting highlights from remarkable")

        page_id_to_page_num = get_page_number_mapping(working_path, document.id)
        if page_id_to_page_num is None:
            logging.info(f"Failed to get page_id_to_page_num mapping for {document.id}")
            return []

        raw_highlights_by_page = get_raw_highlights_by_page(working_path, document.id)
        if raw_highlights_by_page is None:
            logging.info(f"Failed to get raw highlights for {document.id}")
            return []

        return self._from_raw_highlights(
            document.id, page_id_to_page_num, raw_highlights_by_page
        )

    def _from_raw_highlights(self,
                             doc_id: str,
                             page_id_to_page_num: Dict[str, int],
                             raw_highlights_by_page: Dict[str, List[RawHighlight]]) -> List[models.Highlight]:
        """ Create Highlights from the raw highlights of each page.

        Consecutive raw highlights of a page are merged into one highlight
        while the gap between them is at most 3; a larger gap starts a new
        highlight. Highlights are never joined across pages. The per-page
        lists are expected to be sorted by ``start``.

        :param doc_id: Id of the document the highlights belong to.
        :param page_id_to_page_num: Mapping of page id to page number.
        :param raw_highlights_by_page: Raw highlights keyed by page id.
        :returns: One ``models.Highlight`` per merged run of raw highlights.
        :raises KeyError: If a page with highlights is missing from
            ``page_id_to_page_num``.
        """
        # TODO: This can join across pages most likely by checking lengths...
        #       Need a way to know the length of a page in characters
        highlight_recs: T.List[models.Highlight] = []
        extractor_name = self.__class__.__name__

        def commit(text: str, page: int) -> None:
            # Store the text accumulated so far as one finished highlight.
            highlight_recs.append(
                models.Highlight.create_highlight(
                    doc_id,
                    highlight_extractor.clean_highlight_text(text),
                    page,
                    extractor_name
                )
            )

        for page_id, page_raw_highlights in raw_highlights_by_page.items():
            if not page_raw_highlights:
                # Nothing was highlighted on this page; do not emit an
                # empty-text highlight for it.
                continue

            highlight_page = page_id_to_page_num[page_id]
            # Seed with the first highlight. This must also happen when the
            # page has exactly one highlight, otherwise its text is lost.
            highlight_text = page_raw_highlights[0].text

            for prev_highlight, cur_highlight in zip(page_raw_highlights, page_raw_highlights[1:]):
                last_ending_index = prev_highlight.start + prev_highlight.length
                # Gap between the end of the previous highlight and the start
                # of the current one; negative when they overlap.
                diff = cur_highlight.start - last_ending_index

                if diff > 3:
                    # if our highlights distance is more than 3 character lets commit what we have
                    # as a highlight. Distance of 3 allows us to join across lines.
                    commit(highlight_text, highlight_page)
                    highlight_text = cur_highlight.text
                elif diff < 0:
                    # Highlights overlap: drop the overlapping tail of the
                    # accumulated text and append the current text. When the
                    # current text is shorter than the overlap, re-append what
                    # remains of the accumulated text after it.
                    highlight_text = (
                        highlight_text[:diff] +
                        cur_highlight.text +
                        (highlight_text[len(highlight_text) + diff + len(cur_highlight.text):]
                         if abs(diff) > len(cur_highlight.text) else ""
                         )
                    )
                else:  # 0 <= diff <= 3
                    # Highlights are adjacent or separated by a small gap
                    # (e.g. a line break): join them with a space.
                    highlight_text += " " + cur_highlight.text

            # Commit whatever is still pending for this page.
            commit(highlight_text, highlight_page)
        return highlight_recs