From 59e7530784fc63714334f945ac8ad50fbd59400f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Reuh=20Fildadut?= Date: Sun, 12 Oct 2025 15:03:34 +0200 Subject: [PATCH] feat: make sure on-disk data is always in a consistent state during writes --- README.md | 1 - src/feather/client.py | 4 +- src/feather/config.py | 3 -- src/feather/data.py | 111 +++++++++++++++++++++++++---------------- src/feather/feather.py | 8 --- 5 files changed, 69 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 2dae6f1..7e5eb78 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,6 @@ before publishing: - [ ] Write documentation - [ ] Share the fun somewhere - [ ] Test with FreshRSS -- [ ] Check locks & errors while writing to db (if JSON file corrupted) - [ ] Feed & category order for googlereader ?? "i'll look into it later": diff --git a/src/feather/client.py b/src/feather/client.py index 7b45275..f340d78 100644 --- a/src/feather/client.py +++ b/src/feather/client.py @@ -128,7 +128,7 @@ class GReaderArticle(Article): self.feed_url = item_content.origin.html_url self.article_url = item_content.canonical[0].href - self.compute_json_path() + self._compute_json_path() class TTRSession(ClientSession): @@ -227,4 +227,4 @@ class TTRArticle(Article): self.language = article.lang self.image_url = article.flavor_image - self.compute_json_path() + self._compute_json_path() diff --git a/src/feather/config.py b/src/feather/config.py index fdda69d..e1643bd 100644 --- a/src/feather/config.py +++ b/src/feather/config.py @@ -85,9 +85,6 @@ class Config: self.daemon_sync_up_every: int = int(get_config("daemon", "sync_up_every")) self.daemon_sync_down_every: int = int(get_config("daemon", "sync_down_every")) - # Computed config fields - self.update_lock: Path = self.json_root / "update.lock" - # Create missing directories self.html_root.mkdir(exist_ok=True) self.json_root.mkdir(exist_ok=True) diff --git a/src/feather/data.py b/src/feather/data.py index 91948a1..d0f7062 100644 --- a/src/feather/data.py +++ b/src/feather/data.py @@ -8,6 +8,7 @@ from abc import ABC from datetime import datetime from pathlib import Path from hashlib import sha256 +from tempfile import NamedTemporaryFile from feather.config import Config @@ -17,7 +18,7 @@ def sanitize_filename( ) -> str: """Escape invalid caracters and truncate the filename as per the configuration. This operates on a single filename, not a path. - (insert_before_suffix will be inserted between the stem and suffix, and is assumed to not need escaping)""" + (insert_before_suffix will be inserted between the stem and suffix, and is assumed to not need escaping).""" filename = filename.translate(config.filename_translation) max_filename_length = config.max_filename_length @@ -38,12 +39,24 @@ def sanitize_filename( def format_datetime(config: Config, timestamp: int) -> str: - """Format a timestamp according to the configuraiton""" + """Format a timestamp according to the configuraiton.""" return datetime.fromtimestamp(timestamp, config.timezone).strftime( config.time_format ) +def atomic_write(path: Path, content: str): + """Write data to a file atomically.""" + try: + with NamedTemporaryFile(dir=path.parent, delete=False) as f: + path_temp = Path(f.name) + f.write(content.encode("utf-8")) + path_temp.rename(path) + except: + path_temp.unlink(missing_ok=True) + raise + + type CategoryId = int | str @@ -100,6 +113,9 @@ class Article(ABC): language: str = "" # article language image_url: str = "" # article main image + def _hash_id(self): + return sha256(str(self.id).encode("utf-8")).hexdigest() + def _get_html_path(self) -> Path: config = self.config @@ -121,7 +137,7 @@ class Article(ABC): html_path = category_directory / html_name # Add hash if filename already exists if html_path.exists(): - id_hash = sha256(str(self.id).encode("utf-8")).hexdigest()[:8] + id_hash = self._hash_id()[:8] html_path = html_path.parent / sanitize_filename( config, html_path.name, insert_before_suffix=f".{id_hash}" ) @@ -152,16 +168,13 @@ class Article(ABC): d["category"] = self.category.asdict() return d - def compute_json_path(self): - self.json_path = ( - self.config.json_root - / f"{sha256(str(self.id).encode('utf-8')).hexdigest()}.json" - ) + def _compute_json_path(self): + self.json_path = self.config.json_root / f"{self._hash_id()}.json" - def write_json(self, recompute_path=False): + def _write_json(self, recompute_path=False): """Write the JSON file associated with this article. Error if it already exists.""" if recompute_path: - self.compute_json_path() + self._compute_json_path() stored_fields = ( "id", "unread", @@ -184,25 +197,16 @@ class Article(ABC): article_json = {field: getattr(self, field) for field in stored_fields} article_json["category"] = self.category.asdict() if self.json_path.exists(): - raise Exception( - f"Unexpectedly tried to overwrite article data for {self.json_path}" + raise FileExistsError( + f"Unexpectedly tried to overwrite article data for {self.json_path}: either the feed has duplicate entries, or another feather process tried to write the same file at the same time (in which case, try again later)" ) - with self.json_path.open("w") as f: - json.dump(article_json, f) + atomic_write(self.json_path, json.dumps(article_json)) - def delete_json(self): + def _delete_json(self, missing_ok=False): """Delete the JSON file associated with this article.""" - self.json_path.unlink() + self.json_path.unlink(missing_ok=missing_ok) - def has_html(self) -> bool: - """Check if the HTML file associated with the article exists on disk.""" - if self.html_path is None: - return False - else: - html_path = self.config.html_root / self.html_path - return html_path.exists() - - def write_html(self, recompute_path=False): + def _write_html(self, recompute_path=False): """Write the HTML file associated with this article. Error if it already exists. Note: this may recompute html_path, which is saved into the JSON - so make sure to save the JSON file _after_ the HTML file.""" # Write HTML file for a JSON object @@ -212,43 +216,62 @@ class Article(ABC): self.html_path = str(self._get_html_path()) html_path = config.html_root / self.html_path + html_path.parent.mkdir(parents=True, exist_ok=True) if html_path.exists(): - raise Exception( - f"Unexpectedly tried to overwrite article file for {html_path}. Either the feed has duplicate entries, or something has gone terribly wrong." + raise FileExistsError( + f"Unexpectedly tried to overwrite article file for {html_path}: either the feed has duplicate entries, or another feather process tried to write the same file at the same time (in which case, try again later)" ) - else: - html_path.parent.mkdir(parents=True, exist_ok=True) - with html_path.open("w") as f: - f.write(config.article_template.render(self._get_template_dict())) - # set accessed date to update time, modified to publication time - os.utime(html_path, (max(self.updated, self.updated), self.published)) + atomic_write( + html_path, config.article_template.render(self._get_template_dict()) + ) + # set accessed date to update time, modified to publication time + os.utime(html_path, (max(self.updated, self.updated), self.published)) - def delete_html(self, ignore_deleted=False): + def _delete_html(self, missing_ok=False): """Delete the HTML file associated with this article.""" # Delete a HTML file for a JSON object html_path = self.config.html_root / self.html_path - if not ignore_deleted or html_path.exists(): - html_path.unlink() + html_path.unlink(missing_ok=missing_ok) + + def has_html(self) -> bool: + """Check if the HTML file associated with the article exists on disk.""" + if self.html_path is None: + return False + else: + html_path = self.config.html_root / self.html_path + return html_path.exists() + + def was_updated(self, old_article: Article) -> bool: + """Returns true if the article is different from a previous version in a way that would require regeneration""" + return old_article._get_template_dict() != self._get_template_dict() def write(self, recompute_paths=False): """Write all the files associated with this article to disk.""" - self.write_html(recompute_path=recompute_paths) - self.write_json(recompute_path=recompute_paths) + try: + self._write_html(recompute_path=recompute_paths) + except FileExistsError: + raise + except: + self._delete_html(missing_ok=True) + raise + try: + self._write_json(recompute_path=recompute_paths) + except FileExistsError: + raise + except: + self._delete_json(missing_ok=True) + raise def delete(self): """Delete all the files associated with this article.""" - self.delete_html(ignore_deleted=True) - self.delete_json() + self._delete_html(missing_ok=True) + self._delete_json() def regenerate(self): """Delete and rewrite all the files associated with this article using to the latest configuration.""" self.delete() # paths might change so we preemptively remove the old file self.write(recompute_paths=True) # rewrite JSON & HTML - def was_updated(self, old_article: Article) -> bool: - """Returns true if the article is different from a previous version in a way that would require regeneration""" - return old_article._get_template_dict() != self._get_template_dict() - class FileArticle(Article): def __init__(self, config: Config, json_path: Path) -> Article: diff --git a/src/feather/feather.py b/src/feather/feather.py index aa299ed..7859e63 100755 --- a/src/feather/feather.py +++ b/src/feather/feather.py @@ -59,12 +59,6 @@ class FeatherApp: config = self.config client_session = self.get_client_session() - if config.update_lock.exists(): - print( - "The previous synchronization was aborted, not marking any article as read/unread in order to avoid collateral damage" - ) - return - # gather articles to mark as read/unread marked_as_read, marked_as_unread = 0, 0 to_mark_as_read = [] @@ -108,7 +102,6 @@ class FeatherApp: config = self.config client_session = self.get_client_session() - config.update_lock.touch() print("Synchronizing with server...") new_articles, updated_articles = 0, 0 @@ -154,7 +147,6 @@ class FeatherApp: print( f"Synchronization successful ({new_articles} new articles, {updated_articles} updated, {removed_articles} removed)" ) - config.update_lock.unlink() def synchronize(self): """Do a full feather update"""