mirror of https://codeberg.org/Reuh/feather.git

feat: make sure on-disk data is always in a consistent state during writes

Étienne Fildadut 2025-10-12 15:03:34 +02:00
parent 66e7aa0eb4
commit 59e7530784
5 changed files with 69 additions and 58 deletions
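
The change is built around the classic temp-file-and-rename pattern: write the new content to a temporary file in the destination directory, then rename it over the target only once the write has fully succeeded. Because a rename within a single filesystem is atomic on POSIX, a reader (or a crash) can observe the old file or the complete new file, but never a half-written one. A minimal sketch of the pattern (illustrative only, not the committed code):

import os
from pathlib import Path
from tempfile import NamedTemporaryFile

def atomic_write_sketch(path: Path, content: str) -> None:
    # The temporary file lives in the destination directory so the final
    # rename stays within one filesystem, where it is atomic.
    with NamedTemporaryFile(dir=path.parent, delete=False) as f:
        f.write(content.encode("utf-8"))
        temp_path = Path(f.name)
    # Swap the finished file into place in a single atomic step.
    os.replace(temp_path, path)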

View file

@@ -70,7 +70,6 @@ before publishing:
 - [ ] Write documentation
 - [ ] Share the fun somewhere
 - [ ] Test with FreshRSS
-- [ ] Check locks & errors while writing to db (if JSON file corrupted)
 - [ ] Feed & category order for googlereader ??
 "i'll look into it later":

View file

@@ -128,7 +128,7 @@ class GReaderArticle(Article):
         self.feed_url = item_content.origin.html_url
         self.article_url = item_content.canonical[0].href
-        self.compute_json_path()
+        self._compute_json_path()


 class TTRSession(ClientSession):

@@ -227,4 +227,4 @@ class TTRArticle(Article):
         self.language = article.lang
         self.image_url = article.flavor_image
-        self.compute_json_path()
+        self._compute_json_path()

View file

@@ -85,9 +85,6 @@ class Config:
         self.daemon_sync_up_every: int = int(get_config("daemon", "sync_up_every"))
         self.daemon_sync_down_every: int = int(get_config("daemon", "sync_down_every"))

-        # Computed config fields
-        self.update_lock: Path = self.json_root / "update.lock"
-
         # Create missing directories
         self.html_root.mkdir(exist_ok=True)
         self.json_root.mkdir(exist_ok=True)

View file

@@ -8,6 +8,7 @@ from abc import ABC
 from datetime import datetime
 from pathlib import Path
 from hashlib import sha256
+from tempfile import NamedTemporaryFile

 from feather.config import Config

@@ -17,7 +18,7 @@ def sanitize_filename(
 ) -> str:
     """Escape invalid characters and truncate the filename as per the configuration.
     This operates on a single filename, not a path.
-    (insert_before_suffix will be inserted between the stem and suffix, and is assumed to not need escaping)"""
+    (insert_before_suffix will be inserted between the stem and suffix, and is assumed to not need escaping)."""

     filename = filename.translate(config.filename_translation)
     max_filename_length = config.max_filename_length
@@ -38,12 +39,24 @@ def sanitize_filename(

 def format_datetime(config: Config, timestamp: int) -> str:
-    """Format a timestamp according to the configuration"""
+    """Format a timestamp according to the configuration."""
     return datetime.fromtimestamp(timestamp, config.timezone).strftime(
         config.time_format
     )


+def atomic_write(path: Path, content: str):
+    """Write data to a file atomically."""
+    try:
+        with NamedTemporaryFile(dir=path.parent, delete=False) as f:
+            path_temp = Path(f.name)
+            f.write(content.encode("utf-8"))
+        path_temp.rename(path)
+    except:
+        path_temp.unlink(missing_ok=True)
+        raise
+
+
 type CategoryId = int | str
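
A note on the new helper: creating the temporary file with dir=path.parent keeps it on the same filesystem as the destination, which is what makes the final rename atomic; on failure the temporary file is unlinked and the destination is never touched, so an existing file is either fully replaced or left intact. One portability caveat: Path.rename() raises FileExistsError on Windows when the target already exists, whereas os.replace() overwrites atomically on both platforms.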
@@ -100,6 +113,9 @@ class Article(ABC):
     language: str = ""  # article language
     image_url: str = ""  # article main image

+    def _hash_id(self):
+        return sha256(str(self.id).encode("utf-8")).hexdigest()
+
     def _get_html_path(self) -> Path:
         config = self.config

@@ -121,7 +137,7 @@ class Article(ABC):
         html_path = category_directory / html_name
         # Add hash if filename already exists
         if html_path.exists():
-            id_hash = sha256(str(self.id).encode("utf-8")).hexdigest()[:8]
+            id_hash = self._hash_id()[:8]
             html_path = html_path.parent / sanitize_filename(
                 config, html_path.name, insert_before_suffix=f".{id_hash}"
             )
@@ -152,16 +168,13 @@ class Article(ABC):
         d["category"] = self.category.asdict()
         return d

-    def compute_json_path(self):
-        self.json_path = (
-            self.config.json_root
-            / f"{sha256(str(self.id).encode('utf-8')).hexdigest()}.json"
-        )
+    def _compute_json_path(self):
+        self.json_path = self.config.json_root / f"{self._hash_id()}.json"

-    def write_json(self, recompute_path=False):
+    def _write_json(self, recompute_path=False):
         """Write the JSON file associated with this article. Error if it already exists."""
         if recompute_path:
-            self.compute_json_path()
+            self._compute_json_path()
         stored_fields = (
             "id",
             "unread",

@@ -184,25 +197,16 @@ class Article(ABC):
         article_json = {field: getattr(self, field) for field in stored_fields}
         article_json["category"] = self.category.asdict()
         if self.json_path.exists():
-            raise Exception(
-                f"Unexpectedly tried to overwrite article data for {self.json_path}"
+            raise FileExistsError(
+                f"Unexpectedly tried to overwrite article data for {self.json_path}: either the feed has duplicate entries, or another feather process tried to write the same file at the same time (in which case, try again later)"
             )
-        with self.json_path.open("w") as f:
-            json.dump(article_json, f)
+        atomic_write(self.json_path, json.dumps(article_json))

-    def delete_json(self):
+    def _delete_json(self, missing_ok=False):
         """Delete the JSON file associated with this article."""
-        self.json_path.unlink()
+        self.json_path.unlink(missing_ok=missing_ok)

-    def has_html(self) -> bool:
-        """Check if the HTML file associated with the article exists on disk."""
-        if self.html_path is None:
-            return False
-        else:
-            html_path = self.config.html_root / self.html_path
-            return html_path.exists()
-
-    def write_html(self, recompute_path=False):
+    def _write_html(self, recompute_path=False):
         """Write the HTML file associated with this article. Error if it already exists.
         Note: this may recompute html_path, which is saved into the JSON - so make sure to save the JSON file _after_ the HTML file."""
         # Write HTML file for a JSON object
@@ -212,43 +216,62 @@ class Article(ABC):
         self.html_path = str(self._get_html_path())
         html_path = config.html_root / self.html_path

+        html_path.parent.mkdir(parents=True, exist_ok=True)
         if html_path.exists():
-            raise Exception(
-                f"Unexpectedly tried to overwrite article file for {html_path}. Either the feed has duplicate entries, or something has gone terribly wrong."
+            raise FileExistsError(
+                f"Unexpectedly tried to overwrite article file for {html_path}: either the feed has duplicate entries, or another feather process tried to write the same file at the same time (in which case, try again later)"
             )
-        else:
-            html_path.parent.mkdir(parents=True, exist_ok=True)
-            with html_path.open("w") as f:
-                f.write(config.article_template.render(self._get_template_dict()))
-            # set accessed date to update time, modified to publication time
-            os.utime(html_path, (max(self.updated, self.updated), self.published))
+        atomic_write(
+            html_path, config.article_template.render(self._get_template_dict())
+        )
+        # set accessed date to update time, modified to publication time
+        os.utime(html_path, (max(self.updated, self.updated), self.published))

-    def delete_html(self, ignore_deleted=False):
+    def _delete_html(self, missing_ok=False):
         """Delete the HTML file associated with this article."""
         # Delete a HTML file for a JSON object
         html_path = self.config.html_root / self.html_path
-        if not ignore_deleted or html_path.exists():
-            html_path.unlink()
+        html_path.unlink(missing_ok=missing_ok)
+
+    def has_html(self) -> bool:
+        """Check if the HTML file associated with the article exists on disk."""
+        if self.html_path is None:
+            return False
+        else:
+            html_path = self.config.html_root / self.html_path
+            return html_path.exists()
+
+    def was_updated(self, old_article: Article) -> bool:
+        """Returns true if the article is different from a previous version in a way that would require regeneration"""
+        return old_article._get_template_dict() != self._get_template_dict()

     def write(self, recompute_paths=False):
         """Write all the files associated with this article to disk."""
-        self.write_html(recompute_path=recompute_paths)
-        self.write_json(recompute_path=recompute_paths)
+        try:
+            self._write_html(recompute_path=recompute_paths)
+        except FileExistsError:
+            raise
+        except:
+            self._delete_html(missing_ok=True)
+            raise
+        try:
+            self._write_json(recompute_path=recompute_paths)
+        except FileExistsError:
+            raise
+        except:
+            self._delete_json(missing_ok=True)
+            raise

     def delete(self):
         """Delete all the files associated with this article."""
-        self.delete_html(ignore_deleted=True)
-        self.delete_json()
+        self._delete_html(missing_ok=True)
+        self._delete_json()

     def regenerate(self):
         """Delete and rewrite all the files associated with this article using the latest configuration."""
         self.delete()  # paths might change so we preemptively remove the old file
         self.write(recompute_paths=True)  # rewrite JSON & HTML

-    def was_updated(self, old_article: Article) -> bool:
-        """Returns true if the article is different from a previous version in a way that would require regeneration"""
-        return old_article._get_template_dict() != self._get_template_dict()
-

 class FileArticle(Article):
     def __init__(self, config: Config, json_path: Path) -> Article:
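
The error handling in write() is deliberate: a FileExistsError propagates without cleanup, since the pre-existing file was not written by this call and deleting it could discard another process's data, while any other failure removes the partial file. Combined with writing the HTML before the JSON (the JSON records html_path, as the _write_html docstring notes), an interrupted write never leaves a JSON entry pointing at a missing or torn HTML file.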

View file

@@ -59,12 +59,6 @@ class FeatherApp:
         config = self.config
         client_session = self.get_client_session()

-        if config.update_lock.exists():
-            print(
-                "The previous synchronization was aborted, not marking any article as read/unread in order to avoid collateral damage"
-            )
-            return
-
         # gather articles to mark as read/unread
         marked_as_read, marked_as_unread = 0, 0
         to_mark_as_read = []

@@ -108,7 +102,6 @@ class FeatherApp:
         config = self.config
         client_session = self.get_client_session()

-        config.update_lock.touch()
         print("Synchronizing with server...")

         new_articles, updated_articles = 0, 0

@@ -154,7 +147,6 @@ class FeatherApp:
         print(
             f"Synchronization successful ({new_articles} new articles, {updated_articles} updated, {removed_articles} removed)"
         )
-        config.update_lock.unlink()

     def synchronize(self):
         """Do a full feather update"""