From 07e9d208b1d8a33f30e3ba4ced6652dbd6eb3151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Reuh=20Fildadut?= Date: Sat, 11 Oct 2025 14:35:14 +0200 Subject: [PATCH] feat: allow retrieving read articles --- README.md | 2 +- .../{feedreaderclient.py => client.py} | 75 +++++++++++++++++-- src/feather/config.default.toml | 44 +++++++++-- src/feather/config.py | 15 ++-- src/feather/{articledata.py => data.py} | 73 +++++------------- src/feather/feather.py | 12 +-- 6 files changed, 140 insertions(+), 81 deletions(-) rename src/feather/{feedreaderclient.py => client.py} (55%) rename src/feather/{articledata.py => data.py} (69%) diff --git a/README.md b/README.md index 2bb1d98..521cd8d 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,6 @@ You need Python 3.12 or newer. Then pip it up, as the kids say. - [ ] Use inotify for real-time article mark-as-read action - [ ] Share the fun somewhere - [ ] Actually think about the issues created by the duplicate warning -- [ ] Attachments +- [ ] Get article attachments - [ ] Test with FreshRSS diff --git a/src/feather/feedreaderclient.py b/src/feather/client.py similarity index 55% rename from src/feather/feedreaderclient.py rename to src/feather/client.py index 8f2958b..4a804e3 100644 --- a/src/feather/feedreaderclient.py +++ b/src/feather/client.py @@ -1,12 +1,14 @@ """Connection between the remote server and feather""" +from __future__ import annotations + import re from abc import ABC, abstractmethod from ttrss.client import TTRClient import google_reader from feather.config import Config -from feather.articledata import Article, GReaderArticle, TTRArticle, ArticleId, Category, CategoryId +from feather.data import Article, ArticleId, Category class ClientSession(ABC): config: Config @@ -22,7 +24,7 @@ class ClientSession(ABC): pass @abstractmethod - def get_unread_articles_in_category(self, category_id: CategoryId, limit: int, continuation: int=0) -> list[Article]: + def get_articles_in_category(self, category: 
Category, limit: int, continuation: int = 0, unread_only: bool = False) -> list[Article]: """Returns a list of Articles in the given category. limit and continuation are required for pagination.""" pass @@ -51,11 +53,37 @@ class GReaderSession(ClientSession): l.append(Category(id=category_id, title=category_name)) return l - def get_unread_articles_in_category(self, category, limit=500, continuation=0) -> list[GReaderArticle]: - items_ids = self.greader.get_stream_items_ids(self.auth_token, stream_id=category.id, exclude_target="user/-/state/com.google/read", limit=limit, continuation=continuation) + def get_articles_in_category(self, category: Category, limit: int = 1000, continuation: int = 0, unread_only: bool = False) -> list[GReaderArticle]: + items_ids = self.greader.get_stream_items_ids( + self.auth_token, + stream_id=category.id, + exclude_target="user/-/state/com.google/read" if unread_only else None, + limit=limit, + continuation=continuation, + ) item_contents = self.greader.get_stream_items_contents(self.auth_token, self.csrf_token, item_ids=[item.id for item in items_ids.item_refs]) return [ GReaderArticle(self, category, item_content) for item_content in item_contents.items ] +class GReaderArticle(Article): + def __init__(self, session: GReaderSession, category: Category, item_content): + self.config = session.config + + self.id = item_content.id + self.category = category + + self.unread = "user/-/state/com.google/read" not in item_content.categories + self.title = item_content.title + self.published = item_content.published + self.updated = item_content.updated + self.author = item_content.author + self.summary = item_content.summary.content + self.content = item_content.content.content + self.feed_title = item_content.origin.title + self.feed_url = item_content.origin.html_url + self.article_url = item_content.canonical[0].href + + self.compute_fields() + class TTRSession(ClientSession): """Tiny Tiny RSS API client""" ttrss: TTRClient @@ -93,6 
+121,41 @@ class TTRSession(ClientSession): tree = self.ttrss.get_feed_tree() return get_categories_recursive(tree["categories"]) - def get_unread_articles_in_category(self, category, limit=100, continuation=0) -> list[TTRArticle]: - headlines = self.ttrss.get_headlines(feed_id=category.id, limit=limit, skip=continuation, is_cat=True, show_excerpt=True, show_content=True, view_mode="unread", include_attachments=False, include_nested=False) + def get_articles_in_category(self, category: Category, limit: int = 200, continuation: int = 0, unread_only: bool = False) -> list[TTRArticle]: + headlines = self.ttrss.get_headlines( + feed_id=category.id, + limit=limit, + skip=continuation, + is_cat=True, + show_excerpt=True, + show_content=True, + view_mode="unread" if unread_only else "all_articles", + include_attachments=False, + include_nested=False, + ) return [ TTRArticle(self, category, headline) for headline in headlines ] + +class TTRArticle(Article): + def __init__(self, session: TTRSession, category: Category, article): + self.config = session.config + + self.id = article.id + self.category = category + + self.unread = article.unread + self.title = article.title + self.published = article.updated.timestamp() + self.updated = article.updated.timestamp() + self.author = article.author + self.summary = article.excerpt + self.content = article.content + self.feed_title = article.feed_title + self.feed_url = article.site_url + self.feed_icon_url = session.feeds[article.feed_id]["icon"] + self.feed_order = session.feeds[article.feed_id]["order"] + self.article_url = article.link + self.comments_url = article.comments_link + self.language = article.lang + self.image_url = article.flavor_image + + self.compute_fields() diff --git a/src/feather/config.default.toml b/src/feather/config.default.toml index 62805f7..55aff63 100644 --- a/src/feather/config.default.toml +++ b/src/feather/config.default.toml @@ -1,3 +1,8 @@ +# Feather default configuration file. 
+# You can override any of these values by either: +# - creating a config.toml file containing your user configuration. You can choose another filename by setting the CONFIG_PATH environment variable. +# - setting environment variables for the values you want to override (the environment variable name for each value can be found in the comments below). + [server] # Server API to use. Either "googlereader" for the Google Reader API (FreshRSS, Miniflux, etc.) or "ttrss" for the TinyTiny-RSS API. # The Google Reader API do not support nested categories. @@ -12,11 +17,15 @@ user = "username" # Can be set through the environment variable SERVER_PASSWORD. password = "password" # How many items to retrieve at most from the server in a single request. Lower values will make synchronization slower, higher values might make the server complain. +# If you are missing articles after a sync, it might be because this value is too high. # If you are using the Google Reader API: servers should be okay with up to 1000. # If you are using the ttrss API: servers should be okay with up to 200. -# Set to 0 to let feather choose. -# Can be set through the environment variable SERVER_ITEMS_PER_REQUEST. -items_per_request = 0 +# Set to 0 to let feather choose (200 for ttrss, 1000 for googlereader). +# Can be set through the environment variable SERVER_ARTICLES_PER_REQUEST. +articles_per_request = 0 +# Set to true to only sync unread articles; feather will not retrieve or store any read article. +# Can be set through the environment variable SERVER_ONLY_SYNC_UNREAD_ARTICLES. +only_sync_unread_articles = true [directories] # Directory path where the internal feather data will be stored. @@ -27,9 +36,28 @@ data = "data" reader = "reader" [html] -# HTML template used for generating item HTML files. All templates are Jinja2 templates. +# HTML template used for generating article HTML files. All templates are Jinja2 templates.
+# Available fields: +# - id: article id (int | str) +# - title: article title (str) +# - published: article publication time (timestamp) (int) +# - published_formatted: article publication time (text) (str) +# - updated: article update time (timestamp) (int) +# - updated_formatted: article update time (text) (str) +# - author: article author (str) +# - summary: article summary (HTML) (str) +# - content: article content (HTML) (str) +# - feed_title: feed title (str) +# - feed_url: feed URL (str) +# - feed_icon_url: feed icon URL (str) +# - feed_order: feed display order, starting from 1 (0 if unknown) (int) +# - article_url: article URL (str) +# - comments_url: article comments URL (str) +# - language: article language (str) +# - image_url: article main image (str) +# - category: feed category (Category) # Can be set through the environment variable HTML_TEMPLATE. -template = ''' +article_template = ''' @@ -68,9 +96,15 @@ template = ''' ''' # Filename template for generated HTML files. +# The available fields are the same as for article_template. # Can be set through the environment variable HTML_FILENAME_TEMPLATE. filename_template = "[{{ feed_title }}]\t{{ title }} ({{ published_formatted }}).html" # Category directory name template for generated HTML files. +# Available fields: +# - id: category id (str | int) +# - title: category name (str) +# - parents: list of parent categories (list[Category]) +# - order: category display order, starting from 1 (0 if unknown) (int) # Can be set through the environment variable HTML_CATEGORY_TEMPLATE. category_template = "{% if order %}{{ '%02d' % order }} {% endif %}{{ title }}" # Maximum allowed filename length (in bytes assuming UTF-8 encoding) before truncating. Depending on your filesystem filename's limits it may be possible to increase the value, ask Wikipedia for details.
diff --git a/src/feather/config.py b/src/feather/config.py index 9add879..fd9d255 100644 --- a/src/feather/config.py +++ b/src/feather/config.py @@ -43,20 +43,21 @@ class Config: self.server_api: str = str(get_config("server", "api")) if self.server_api not in ("googlereader", "ttrss"): - raise ConfigurationError(f"server.api must be either ttrss or googlereader") + raise ConfigurationError(f"server.api must be either ttrss or googlereader, not {self.server_api}") self.server_url: str = str(get_config("server", "url", False)) self.server_user: str = str(get_config("server", "user", False)) self.server_password: str = str(get_config("server", "password", False)) - self.items_per_query: int = int(get_config("server", "items_per_request")) - if self.items_per_query == 0: - self.items_per_query = 1000 if self.server_api == "googlereader" else 200 + self.articles_per_query: int = int(get_config("server", "articles_per_request")) + if self.articles_per_query == 0: + self.articles_per_query = 1000 if self.server_api == "googlereader" else 200 + self.only_sync_unread_articles: bool = bool(get_config("server", "only_sync_unread_articles")) self.timezone: ZoneInfo = ZoneInfo(str(get_config("datetime", "timezone"))) self.time_format: str = str(get_config("datetime", "format")) - self.item_template: Template = Template(str(get_config("html", "template")), autoescape=True) - self.item_filename_template: Template = Template(str(get_config("html", "filename_template")), autoescape=False) - self.item_category_template: Template = Template(str(get_config("html", "category_template")), autoescape=False) + self.article_template: Template = Template(str(get_config("html", "article_template")), autoescape=True) + self.article_filename_template: Template = Template(str(get_config("html", "filename_template")), autoescape=False) + self.article_category_template: Template = Template(str(get_config("html", "category_template")), autoescape=False) self.max_filename_length: int = 
int(get_config("html", "max_filename_length")) self.filename_translation = str.maketrans(get_config("html", "filename_replacement")) diff --git a/src/feather/articledata.py b/src/feather/data.py similarity index 69% rename from src/feather/articledata.py rename to src/feather/data.py index 175e1e7..9e4cb2f 100644 --- a/src/feather/articledata.py +++ b/src/feather/data.py @@ -4,7 +4,7 @@ from __future__ import annotations import os import json -from abc import ABC, abstractmethod +from abc import ABC from datetime import datetime from pathlib import Path from hashlib import sha256 @@ -58,16 +58,22 @@ type ArticleId = int | str class Article(ABC): config: Config - json_path: Path - html_path: str + # fields serialized into the JSON file # + + # no default value id: ArticleId # article id + category: Category # feed category + # no default value, computed by compute_fields + published_formatted: str # article publication time (text) + updated_formatted: str # article publication time (text) + html_path: str # html path, relative to the html_root directory + # with default value + unread: bool = True # if the article is unread title: str = "" # article title published: int = 0 # article publication time (timestamp) - published_formatted: str # article publication time (text) updated: int = 0 # article update time (timestamp) - updated_formatted: str # article publication time (text) author: str = "" # article author summary: str = "" # article summary (HTML) content: str = "" # article content (HTML) @@ -79,16 +85,15 @@ class Article(ABC): comments_url: str = "" # article comments URL language: str = "" # article language image_url: str = "" # article main image - category: Category # feed category def get_html_path(self): config = self.config category_directory = config.html_root for category in self.category.parents: - category_directory /= escape_filename(config, config.item_category_template.render(category.asdict())) - category_directory /= escape_filename(config, 
config.item_category_template.render(self.category.asdict())) + category_directory /= escape_filename(config, config.article_category_template.render(category.asdict())) + category_directory /= escape_filename(config, config.article_category_template.render(self.category.asdict())) - html_name = truncate_filename(config, escape_filename(config, config.item_filename_template.render(self.get_template_dict()))) + html_name = truncate_filename(config, escape_filename(config, config.article_filename_template.render(self.get_template_dict()))) return category_directory / html_name @@ -100,13 +105,13 @@ class Article(ABC): self.html_path = str(self.get_html_path().relative_to(config.html_root)) # TODO: do this dynamically on write, handle overwrite conflict at the same time def get_template_dict(self) -> dict: - template_fields = ("id", "title", "published", "published_formatted", "updated", "updated_formatted", "author", "summary", "content", "feed_title", "feed_url", "feed_icon_url", "feed_order", "article_url", "comments_url", "language", "image_url") + template_fields = ("id", "unread", "title", "published", "published_formatted", "updated", "updated_formatted", "author", "summary", "content", "feed_title", "feed_url", "feed_icon_url", "feed_order", "article_url", "comments_url", "language", "image_url") d = { field: getattr(self, field) for field in template_fields } d["category"] = self.category.asdict() return d def write_json(self): - stored_fields = ("id", "title", "published", "published_formatted", "updated", "updated_formatted", "author", "summary", "content", "feed_title", "feed_url", "feed_icon_url", "feed_order", "article_url", "comments_url", "language", "image_url", "html_path") + stored_fields = ("id", "unread", "title", "published", "published_formatted", "updated", "updated_formatted", "author", "summary", "content", "feed_title", "feed_url", "feed_icon_url", "feed_order", "article_url", "comments_url", "language", "image_url", "html_path") item_json = 
{ field: getattr(self, field) for field in stored_fields } item_json["category"] = self.category.asdict() if self.json_path.exists(): @@ -125,7 +130,7 @@ class Article(ABC): else: html_path.parent.mkdir(parents=True, exist_ok=True) with html_path.open("w") as f: - f.write(config.item_template.render(self.get_template_dict())) + f.write(config.article_template.render(self.get_template_dict())) # set accessed date to update time, modified to publication time os.utime(html_path, (max(self.updated, self.updated), self.published)) def delete_html(self, ignore_deleted=False): @@ -145,53 +150,9 @@ class Article(ABC): self.compute_fields() # recompute formatted datetime & paths from the current configuration self.write() # rewrite JSON & HTML -class GReaderArticle(Article): - def __init__(self, session: GReaderSession, category: Category, item_content): - self.config = session.config - - self.category = category - - self.id = item_content.id - self.title = item_content.title - self.published = item_content.published - self.updated = item_content.updated - self.author = item_content.author - self.summary = item_content.summary.content - self.content = item_content.content.content - self.feed_title = item_content.origin.title - self.feed_url = item_content.origin.html_url - self.article_url = item_content.canonical[0].href - - self.compute_fields() - -class TTRArticle(Article): - def __init__(self, session: TRRSession, category: Category, article): - self.config = session.config - - self.category = category - - self.id = article.id - self.title = article.title - self.published = article.updated.timestamp() - self.updated = article.updated.timestamp() - self.author = article.author - self.summary = article.excerpt - self.content = article.content - self.feed_title = article.feed_title - self.feed_url = article.site_url - self.feed_icon_url = session.feeds[article.feed_id]["icon"] - self.feed_order = session.feeds[article.feed_id]["order"] - self.article_url = article.link - 
self.comments_url = article.comments_link - self.language = article.lang - self.image_url = article.flavor_image - - self.compute_fields() - class FileArticle(Article): def __init__(self, config: Config, json_path: Path) -> Article: self.config = config - self.json_path = json_path item_json = json.load(json_path.open("r")) diff --git a/src/feather/feather.py b/src/feather/feather.py index 0b378ae..37f8edf 100755 --- a/src/feather/feather.py +++ b/src/feather/feather.py @@ -4,8 +4,8 @@ import asyncio import signal from feather.config import Config -from feather.feedreaderclient import GReaderSession, TTRSession, ClientSession -from feather.articledata import FileArticle +from feather.client import GReaderSession, TTRSession, ClientSession +from feather.data import FileArticle class FeatherApp: config: Config @@ -64,8 +64,8 @@ class FeatherApp: article.delete() marked_as_read += 1 - for i in range(0, len(to_mark_as_read), config.items_per_query): - client_session.mark_as_read(to_mark_as_read[i:i+config.items_per_query]) + for i in range(0, len(to_mark_as_read), config.articles_per_query): + client_session.mark_as_read(to_mark_as_read[i:i+config.articles_per_query]) print(f"Marked {marked_as_read} items as read") @@ -86,8 +86,8 @@ class FeatherApp: remaining, continuation = True, 0 while remaining: - articles = client_session.get_unread_articles_in_category(category, limit=config.items_per_query, continuation=continuation) - if len(articles) >= config.items_per_query: + articles = client_session.get_articles_in_category(category, limit=config.articles_per_query, continuation=continuation, unread_only=config.only_sync_unread_articles) + if len(articles) >= config.articles_per_query: continuation += len(articles) else: remaining = False