mirror of https://codeberg.org/Reuh/feather.git, synced 2025-10-27 18:19:32 +00:00

import os
import re
import json
import google_reader
import tomllib
import sys
import argparse
from datetime import datetime
from zoneinfo import ZoneInfo
from pathlib import Path
from hashlib import sha256
from jinja2 import Template

#%% Configuration

class Config:
    def __init__(self):
        with open("config.default.toml", "rb") as f:
            default_config = tomllib.load(f)

        config_path = Path(os.environ.get("CONFIG_PATH") or "config.toml")
        if config_path.exists():
            with config_path.open("rb") as f:
                config = tomllib.load(f)
        elif "CONFIG_PATH" in os.environ:
            print(f"Configuration file {config_path} does not exist; create it or change the CONFIG_PATH environment variable to another path", file=sys.stderr)
            exit(1)
        else:
            config = {}

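        # Each value is resolved in order: environment variable CATEGORY_FIELD, then config.toml, then config.default.toml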
        def get_config(category, field, can_default=True):
            env_name = f"{category.upper()}_{field.upper()}"
            c = config.get(category, {})
            if env_name in os.environ:
                return os.environ[env_name]
            elif field in c:
                return c[field]
            elif can_default:
                return default_config[category][field]
            else:
                print(f"Error while loading configuration: {category}.{field} not found in {config_path} nor in environment variable {env_name}", file=sys.stderr)
                exit(1)

        # Get config fields
        self.html_root: Path = Path(get_config("directories", "reader"))
        self.json_root: Path = Path(get_config("directories", "data"))
        self.server_url: str = str(get_config("server", "url", False))
        self.server_user: str = str(get_config("server", "user", False))
        self.server_password: str = str(get_config("server", "password", False))
        self.items_per_query: int = int(get_config("server", "items_per_request"))
        self.timezone: ZoneInfo = ZoneInfo(str(get_config("datetime", "timezone")))
        self.time_format: str = str(get_config("datetime", "format"))
        self.item_template: Template = Template(str(get_config("html", "template")), autoescape=True)
        self.item_filename_template: Template = Template(str(get_config("html", "filename_template")), autoescape=False)
        self.max_filename_length: int = int(get_config("html", "max_filename_length"))
        self.filename_translation = str.maketrans(get_config("html", "filename_replacement"))

        # Computed config fields
        self.update_lock = self.json_root / "update.lock"

        # Create missing directories
        self.html_root.mkdir(exist_ok=True)
        self.json_root.mkdir(exist_ok=True)

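# For reference, Config looks for a config.toml roughly shaped like the example below. The section and
# key names come from the get_config calls above; the values are purely illustrative (the shipped
# defaults live in config.default.toml, and server.url/user/password have no default and must be set):
#
#   [directories]
#   reader = "reader"   # where the browsable HTML files are written
#   data = "data"       # where the per-item JSON files are stored
#
#   [server]
#   url = "https://rss.example.com/api/greader.php"
#   user = "alice"
#   password = "hunter2"
#   items_per_request = 500
#
#   [datetime]
#   timezone = "Europe/Paris"
#   format = "%Y-%m-%d %H:%M"
#
#   [html]
#   template = "<h1>{{ title }}</h1>{{ content }}"
#   filename_template = "{{ title }}.html"
#   max_filename_length = 255
#   filename_replacement = { "/" = "_" }
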
#%% Interaction with server
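# Extract the human-readable folder name from a Google Reader label id of the form "user/<id>/label/<name>"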
label_name = re.compile("user/.*/label/(.*)")
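# Thin wrapper around google_reader.Client that logs in once and keeps the auth and CSRF tokens for later calls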
class ClientSession:
    client: google_reader.Client
    auth_token: str
    csrf_token: str

    def __init__(self, config: Config):
        self.client = google_reader.Client(config.server_url)
        self.auth_token = self.client.login(config.server_user, config.server_password)
        self.csrf_token = self.client.get_token(self.auth_token)

    def mark_as_read(self, item_ids):
        self.client.edit_tags(self.auth_token, self.csrf_token, item_ids=item_ids, add_tags=[google_reader.STREAM_READ])

    def list_folders(self):
        folders = [tag for tag in self.client.list_tags(self.auth_token) if tag.type == "folder"]
        folder_list = []
        for folder in folders:
            folder_name = folder.label or label_name.search(folder.id).group(1)
            folder_id = folder.id
            folder_list.append((folder_name, folder_id))
        return folder_list

    def get_stream_items_ids(self, *args, **kwargs):
        return self.client.get_stream_items_ids(self.auth_token, *args, **kwargs)

    def get_stream_items_contents(self, *args, **kwargs):
        return self.client.get_stream_items_contents(self.auth_token, self.csrf_token, *args, **kwargs)

#%% Regular feather operations
def mark_deleted_as_read(config, client_session):
    # Mark items that still have a JSON file but whose HTML file was deleted as read on the server
    if config.update_lock.exists():
        print("The previous synchronization was aborted, not marking any item as read in order to avoid collateral damage")
        return

    marked_as_read = 0
    to_mark_as_read = []
    for json_path in config.json_root.glob("*.json"):
        item_json = json.load(json_path.open("r"))
        html_path = config.html_root / item_json["html_path"]
        if not html_path.exists():
            to_mark_as_read.append(item_json["id"])
            # delete JSON file
            json_path.unlink()
            marked_as_read += 1

    # send the read flags to the server in batches of items_per_query
    for i in range(0, len(to_mark_as_read), config.items_per_query):
        client_session.mark_as_read(to_mark_as_read[i:i + config.items_per_query])

    print(f"Marked {marked_as_read} items as read")

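# Apply the html.filename_replacement character translation to make a string safe to use as a filename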
def escape_filename(config, filename):
    return filename.translate(config.filename_translation)

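# Shorten a filename that exceeds html.max_filename_length bytes, keeping its extension and cutting at a valid UTF-8 boundary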
def truncate_filename(config, filename):
    max_filename_length = config.max_filename_length
    filename_utf8 = filename.encode("utf-8")
    if len(filename_utf8) <= max_filename_length:
        return filename
    else:
        suffix = Path(filename).suffix
        max_basename_length = max_filename_length - len(suffix.encode("utf-8"))
        cutoff = len(filename.encode('utf-8')[:max_basename_length].decode('utf-8', errors="ignore"))
        return filename[:cutoff] + '…' + suffix

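# Build the path of an item's HTML file: <html_root>/<escaped folder>/<escaped, truncated filename rendered from html.filename_template>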
def get_html_path(config, item_json):
    folder_directory = config.html_root / escape_filename(config, item_json["folder"])
    folder_directory.mkdir(exist_ok=True)

    html_name = truncate_filename(config, escape_filename(config, config.item_filename_template.render(item_json)))

    return folder_directory / html_name

def format_datetime(config, timestamp):
    return datetime.fromtimestamp(timestamp, config.timezone).strftime(config.time_format)

def set_computed_fields_json(config, item_json):
    item_json["published_formatted"] = format_datetime(config, item_json["published"])
    item_json["updated_formatted"] = format_datetime(config, item_json["updated"])
    item_json["html_path"] = str(get_html_path(config, item_json).relative_to(config.html_root))

def synchronize_with_server(config, client_session):
    # Synchronize items from the server, generating and deleting JSON and HTML files accordingly
    config.update_lock.touch()
    print("Synchronizing with server...")

    new_items, updated_items = 0, 0
    grabbed_item_paths = []

    folders = client_session.list_folders()
    for (folder_name, folder_id) in folders:
        print(f" Updating folder {folder_name}")

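        # Fetch the full contents of a batch of item ids and create or update their JSON and HTML files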
        def process(item_ids):
            nonlocal new_items, updated_items, grabbed_item_paths
            if len(item_ids) > 0:
                item_contents = client_session.get_stream_items_contents(item_ids=item_ids)
                for item_content in item_contents.items:
                    item_json = {
                        "id": item_content.id,
                        "folder": folder_name,
                        "title": item_content.title,
                        "published": item_content.published,
                        "updated": item_content.updated,
                        "author": item_content.author,
                        "summary": item_content.summary.content,
                        "content": item_content.content.content,
                        "origin_title": item_content.origin.title,
                        "origin_url": item_content.origin.html_url,
                        "canonical_url": item_content.canonical[0].href,
                    }
                    set_computed_fields_json(config, item_json)

                    json_path = config.json_root / f"{ sha256(item_json["id"].encode("utf-8")).hexdigest() }.json"
                    grabbed_item_paths.append(json_path)

                    write_files, updating = False, False
                    if not json_path.exists():
                        write_files = True
                        new_items += 1
                    else:
                        old_item_json = json.load(json_path.open("r"))
                        if item_json["updated"] > old_item_json["updated"]:
                            write_files, updating = True, True
                            updated_items += 1

                    if write_files:
                        # write JSON
                        with json_path.open("w") as f:
                            json.dump(item_json, f)
                        # write HTML
                        generate_html_for_item(config, item_json, regenerate=updating)

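        # Page through the folder's unread items, items_per_query at a time, until the server returns an empty continuation token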
        continuation = None
        while continuation != '':
            items = client_session.get_stream_items_ids(stream_id=folder_id, exclude_target="user/-/state/com.google/read", limit=config.items_per_query, continuation=continuation)
            item_ids = [item.id for item in items.item_refs]
            process(item_ids)
            continuation = items.continuation

    # Remove items that we didn't get from the server but are in the JSON directory
    removed_items = 0
    for item_path in config.json_root.glob("*.json"):
        if item_path not in grabbed_item_paths:
            # remove HTML
            item_json = json.load(item_path.open("r"))
            remove_html_for_item(config, item_json, ignore_deleted=True)  # ignore if file was deleted by user during sync
            # remove JSON
            item_path.unlink()
            removed_items += 1

    print(f"Synchronization successful ({new_items} new items, {updated_items} updated, {removed_items} removed)")
    config.update_lock.unlink()

def generate_html_for_item(config, item_json, regenerate=False):
    # Write HTML file for a JSON object
    html_path = config.html_root / item_json["html_path"]
    if html_path.exists() and not regenerate:
        print(f"WARNING: a file already exists for {html_path}. Either the feed has duplicate entries, or something has gone terribly wrong.")
    else:
        with html_path.open("w") as f:
            f.write(config.item_template.render(item_json))

def remove_html_for_item(config, item_json, ignore_deleted=False):
    # Delete an HTML file for a JSON object
    html_path = config.html_root / item_json["html_path"]
    if not ignore_deleted or html_path.exists():
        html_path.unlink()

def remove_empty_html_directories(config):
    # Remove empty directories in the HTML directory
    html_root = config.html_root
    for (dirpath, dirnames, filenames) in html_root.walk(top_down=False):
        if dirpath != html_root:
            if len(dirnames) == 0 and len(filenames) == 0:
                dirpath.rmdir()

def synchronize(config, client_session):
    # Do a full feather update
    mark_deleted_as_read(config, client_session)
    synchronize_with_server(config, client_session)
    remove_empty_html_directories(config)

def synchronize_local_changes(config, client_session):
    # Upload local changes (read items) to the server
    mark_deleted_as_read(config, client_session)
    remove_empty_html_directories(config)

def synchronize_remote_changes(config, client_session):
    # Download remote changes (new items, items read from another device) from the server
    synchronize_with_server(config, client_session)
    remove_empty_html_directories(config)

def regenerate_files(config):
    # Regenerate every HTML file and recompute the stored paths from the local JSON data
    for json_path in config.json_root.glob("*.json"):
        item_json = json.load(json_path.open("r"))
        remove_html_for_item(config, item_json, ignore_deleted=True)  # path might change so we preemptively remove the old file
        set_computed_fields_json(config, item_json)  # recompute formatted datetime & path from the current configuration
        # rewrite JSON
        with json_path.open("w") as f:
            json.dump(item_json, f)
        # rewrite HTML
        generate_html_for_item(config, item_json, regenerate=True)

#%% Run feather
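# Command-line entry point; typical invocations (assuming the script is saved as feather.py):
#   python feather.py sync         # full two-way synchronization
#   python feather.py sync-up      # push locally read items to the server
#   python feather.py sync-down    # fetch new and updated items from the server
#   python feather.py regenerate   # rebuild all HTML files from the local JSON data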
def main():
    parser = argparse.ArgumentParser(
        prog="feather",
        description="file-based RSS reader"
    )
    parser.add_argument(
        "action", choices=("sync", "sync-up", "sync-down", "regenerate"),
        help="sync: perform a full synchronization with the server; sync-up: only synchronize local changes to the server (e.g. items read locally); sync-down: only synchronize remote changes from the server (e.g. new items or items read from another device); regenerate: regenerate all HTML files from the local data"
    )
    args = parser.parse_args()

    config = Config()
    if args.action == "sync":
        client_session = ClientSession(config)
        synchronize(config, client_session)
    elif args.action == "sync-up":
        client_session = ClientSession(config)
        synchronize_local_changes(config, client_session)
    elif args.action == "sync-down":
        client_session = ClientSession(config)
        synchronize_remote_changes(config, client_session)
    elif args.action == "regenerate":
        regenerate_files(config)

if __name__ == "__main__":
    main()