feat: make sure on-disk data is always in a consistent state during writes

2025-10-27 10:09:32 +00:00 · 2025-10-12 15:03:34 +02:00 · 2025-10-12 15:03:34 +02:00 · 59e7530784
commit 59e7530784
parent 66e7aa0eb4
5 changed files with 69 additions and 58 deletions
--- a/README.md
+++ b/README.md
@ -70,7 +70,6 @@ before publishing:
 - [ ] Write documentation
 - [ ] Share the fun somewhere
 - [ ] Test with FreshRSS
- [ ] Check locks & errors while writing to db (if JSON file corrupted)
 - [ ] Feed & category order for googlereader ??

 "i'll look into it later":
--- a/src/feather/client.py
+++ b/src/feather/client.py
@ -128,7 +128,7 @@ class GReaderArticle(Article):
        self.feed_url = item_content.origin.html_url
        self.article_url = item_content.canonical[0].href

-        self.compute_json_path()
+        self._compute_json_path()


 class TTRSession(ClientSession):
@ -227,4 +227,4 @@ class TTRArticle(Article):
        self.language = article.lang
        self.image_url = article.flavor_image

-        self.compute_json_path()
+        self._compute_json_path()
--- a/src/feather/config.py
+++ b/src/feather/config.py
@ -85,9 +85,6 @@ class Config:
        self.daemon_sync_up_every: int = int(get_config("daemon", "sync_up_every"))
        self.daemon_sync_down_every: int = int(get_config("daemon", "sync_down_every"))

-        # Computed config fields
-        self.update_lock: Path = self.json_root / "update.lock"
-
        # Create missing directories
        self.html_root.mkdir(exist_ok=True)
        self.json_root.mkdir(exist_ok=True)
--- a/src/feather/data.py
+++ b/src/feather/data.py
@ -8,6 +8,7 @@ from abc import ABC
 from datetime import datetime
 from pathlib import Path
 from hashlib import sha256
+from tempfile import NamedTemporaryFile

 from feather.config import Config

@ -17,7 +18,7 @@ def sanitize_filename(
 ) -> str:
    """Escape invalid caracters and truncate the filename as per the configuration.
    This operates on a single filename, not a path.
-    (insert_before_suffix will be inserted between the stem and suffix, and is assumed to not need escaping)"""
+    (insert_before_suffix will be inserted between the stem and suffix, and is assumed to not need escaping)."""
    filename = filename.translate(config.filename_translation)

    max_filename_length = config.max_filename_length
@ -38,12 +39,24 @@ def sanitize_filename(


 def format_datetime(config: Config, timestamp: int) -> str:
-    """Format a timestamp according to the configuraiton"""
+    """Format a timestamp according to the configuraiton."""
    return datetime.fromtimestamp(timestamp, config.timezone).strftime(
        config.time_format
    )


+def atomic_write(path: Path, content: str):
+    """Write data to a file atomically."""
+    try:
+        with NamedTemporaryFile(dir=path.parent, delete=False) as f:
+            path_temp = Path(f.name)
+            f.write(content.encode("utf-8"))
+        path_temp.rename(path)
+    except:
+        path_temp.unlink(missing_ok=True)
+        raise
+
+
 type CategoryId = int | str


@ -100,6 +113,9 @@ class Article(ABC):
    language: str = ""  # article language
    image_url: str = ""  # article main image

+    def _hash_id(self):
+        return sha256(str(self.id).encode("utf-8")).hexdigest()
+
    def _get_html_path(self) -> Path:
        config = self.config

@ -121,7 +137,7 @@ class Article(ABC):
        html_path = category_directory / html_name
        # Add hash if filename already exists
        if html_path.exists():
-            id_hash = sha256(str(self.id).encode("utf-8")).hexdigest()[:8]
+            id_hash = self._hash_id()[:8]
            html_path = html_path.parent / sanitize_filename(
                config, html_path.name, insert_before_suffix=f".{id_hash}"
            )
@ -152,16 +168,13 @@ class Article(ABC):
        d["category"] = self.category.asdict()
        return d

-    def compute_json_path(self):
-        self.json_path = (
-            self.config.json_root
-            / f"{sha256(str(self.id).encode('utf-8')).hexdigest()}.json"
-        )
+    def _compute_json_path(self):
+        self.json_path = self.config.json_root / f"{self._hash_id()}.json"

-    def write_json(self, recompute_path=False):
+    def _write_json(self, recompute_path=False):
        """Write the JSON file associated with this article. Error if it already exists."""
        if recompute_path:
-            self.compute_json_path()
+            self._compute_json_path()
        stored_fields = (
            "id",
            "unread",
@ -184,25 +197,16 @@ class Article(ABC):
        article_json = {field: getattr(self, field) for field in stored_fields}
        article_json["category"] = self.category.asdict()
        if self.json_path.exists():
-            raise Exception(
-                f"Unexpectedly tried to overwrite article data for {self.json_path}"
+            raise FileExistsError(
+                f"Unexpectedly tried to overwrite article data for {self.json_path}: either the feed has duplicate entries, or another feather process tried to write the same file at the same time (in which case, try again later)"
            )
-        with self.json_path.open("w") as f:
-            json.dump(article_json, f)
+        atomic_write(self.json_path, json.dumps(article_json))

-    def delete_json(self):
+    def _delete_json(self, missing_ok=False):
        """Delete the JSON file associated with this article."""
-        self.json_path.unlink()
+        self.json_path.unlink(missing_ok=missing_ok)

-    def has_html(self) -> bool:
-        """Check if the HTML file associated with the article exists on disk."""
-        if self.html_path is None:
-            return False
-        else:
-            html_path = self.config.html_root / self.html_path
-            return html_path.exists()
-
-    def write_html(self, recompute_path=False):
+    def _write_html(self, recompute_path=False):
        """Write the HTML file associated with this article. Error if it already exists.
        Note: this may recompute html_path, which is saved into the JSON - so make sure to save the JSON file _after_ the HTML file."""
        # Write HTML file for a JSON object
@ -212,43 +216,62 @@ class Article(ABC):
            self.html_path = str(self._get_html_path())

        html_path = config.html_root / self.html_path
+        html_path.parent.mkdir(parents=True, exist_ok=True)
        if html_path.exists():
-            raise Exception(
-                f"Unexpectedly tried to overwrite article file for {html_path}. Either the feed has duplicate entries, or something has gone terribly wrong."
+            raise FileExistsError(
+                f"Unexpectedly tried to overwrite article file for {html_path}: either the feed has duplicate entries, or another feather process tried to write the same file at the same time (in which case, try again later)"
            )
-        else:
-            html_path.parent.mkdir(parents=True, exist_ok=True)
-            with html_path.open("w") as f:
-                f.write(config.article_template.render(self._get_template_dict()))
-            # set accessed date to update time, modified to publication time
-            os.utime(html_path, (max(self.updated, self.updated), self.published))
+        atomic_write(
+            html_path, config.article_template.render(self._get_template_dict())
+        )
+        # set accessed date to update time, modified to publication time
+        os.utime(html_path, (max(self.updated, self.updated), self.published))

-    def delete_html(self, ignore_deleted=False):
+    def _delete_html(self, missing_ok=False):
        """Delete the HTML file associated with this article."""
        # Delete a HTML file for a JSON object
        html_path = self.config.html_root / self.html_path
-        if not ignore_deleted or html_path.exists():
-            html_path.unlink()
+        html_path.unlink(missing_ok=missing_ok)
+
+    def has_html(self) -> bool:
+        """Check if the HTML file associated with the article exists on disk."""
+        if self.html_path is None:
+            return False
+        else:
+            html_path = self.config.html_root / self.html_path
+            return html_path.exists()
+
+    def was_updated(self, old_article: Article) -> bool:
+        """Returns true if the article is different from a previous version in a way that would require regeneration"""
+        return old_article._get_template_dict() != self._get_template_dict()

    def write(self, recompute_paths=False):
        """Write all the files associated with this article to disk."""
-        self.write_html(recompute_path=recompute_paths)
-        self.write_json(recompute_path=recompute_paths)
+        try:
+            self._write_html(recompute_path=recompute_paths)
+        except FileExistsError:
+            raise
+        except:
+            self._delete_html(missing_ok=True)
+            raise
+        try:
+            self._write_json(recompute_path=recompute_paths)
+        except FileExistsError:
+            raise
+        except:
+            self._delete_json(missing_ok=True)
+            raise

    def delete(self):
        """Delete all the files associated with this article."""
-        self.delete_html(ignore_deleted=True)
-        self.delete_json()
+        self._delete_html(missing_ok=True)
+        self._delete_json()

    def regenerate(self):
        """Delete and rewrite all the files associated with this article using to the latest configuration."""
        self.delete()  # paths might change so we preemptively remove the old file
        self.write(recompute_paths=True)  # rewrite JSON & HTML

-    def was_updated(self, old_article: Article) -> bool:
-        """Returns true if the article is different from a previous version in a way that would require regeneration"""
-        return old_article._get_template_dict() != self._get_template_dict()
-

 class FileArticle(Article):
    def __init__(self, config: Config, json_path: Path) -> Article:
--- a/src/feather/feather.py
+++ b/src/feather/feather.py
@ -59,12 +59,6 @@ class FeatherApp:
        config = self.config
        client_session = self.get_client_session()

-        if config.update_lock.exists():
-            print(
-                "The previous synchronization was aborted, not marking any article as read/unread in order to avoid collateral damage"
-            )
-            return
-
        # gather articles to mark as read/unread
        marked_as_read, marked_as_unread = 0, 0
        to_mark_as_read = []
@ -108,7 +102,6 @@ class FeatherApp:
        config = self.config
        client_session = self.get_client_session()

-        config.update_lock.touch()
        print("Synchronizing with server...")

        new_articles, updated_articles = 0, 0
@ -154,7 +147,6 @@ class FeatherApp:
        print(
            f"Synchronization successful ({new_articles} new articles, {updated_articles} updated, {removed_articles} removed)"
        )
-        config.update_lock.unlink()

    def synchronize(self):
        """Do a full feather update"""