# app/services/metadata_processor.py

import os
import markdown
import json
from typing import List, Dict


class MetadataProcessor:
    """
    Scan a directory tree of Markdown files, extract their front matter
    metadata, and write a structured JSON index.

    The generated structure holds two lists:

    * ``categories`` - after ``generate_json`` one entry per directory that
      contains at least one Markdown file with metadata
      (keys: ``name``, ``path``, ``author``).
    * ``favorites`` - one entry per Markdown file whose ``favorite``
      metadata value equals ``true`` (case-insensitive)
      (keys: ``name``, ``image``, ``description``, ``path``).
    """

    def __init__(self, input_dir: str, output_file: str):
        """
        Initialize the MetadataProcessor.

        Args:
            input_dir (str): Directory containing Markdown files.
            output_file (str): Path to save the generated JSON file.
        """
        self.input_dir = input_dir
        self.output_file = output_file
        self.data = {"categories": [], "favorites": []}

    def _extract_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Extract front matter metadata using the 'markdown' package.

        Args:
            file_path (str): Path to the Markdown file.

        Returns:
            dict: Metadata keys mapped to string values. Values that span
            several lines are joined with a single space. Empty when the
            file declares no metadata.
        """
        # Only the read needs the file handle; release it before converting.
        with open(file_path, "r", encoding="utf-8") as file:
            markdown_content = file.read()

        # The 'meta' extension populates md.Meta as a side effect of convert().
        md = markdown.Markdown(extensions=["extra", "nl2br", "meta"])
        md.convert(markdown_content)

        # Metadata is stored in md.Meta as a dictionary of lists.
        if not md.Meta:
            return {}
        return {key: " ".join(value) for key, value in md.Meta.items()}

    def _process_directory(self):
        """
        Recursively scan the input directory for Markdown files and append
        their metadata to ``self.data``.

        Directories and files are visited in sorted order so that the
        collected entries (and therefore the generated JSON) do not depend
        on the order in which the filesystem happens to list them.
        """
        for root, dirs, files in os.walk(self.input_dir):
            # Sorting in place steers os.walk's (top-down) descent order.
            dirs.sort()

            # Every file in this directory shares the same relative path.
            rel_path = os.path.relpath(root, self.input_dir).replace(os.sep, "/")

            for file in sorted(files):
                if not file.endswith(".md"):
                    continue

                metadata = self._extract_metadata(os.path.join(root, file))
                if not metadata:
                    # Files without front matter contribute nothing.
                    continue

                name = metadata.get("name", "Unknown")

                # Add to 'categories'
                self.data["categories"].append({
                    "name": name,
                    "path": rel_path,
                    "author": metadata.get("author", "Unknown"),
                })

                # Add to 'favorites' if 'favorite' is true
                if metadata.get("favorite", "").lower() == "true":
                    self.data["favorites"].append({
                        "name": name,
                        "image": metadata.get("image", "images/default.jpg"),
                        "description": metadata.get("summary", "No description provided"),
                        "path": rel_path,
                    })

    def generate_json(self):
        """
        Generate the JSON structure, deduplicate and sort categories by
        'path', then save it to the output file.

        Each call rebuilds ``self.data`` from a fresh scan, so calling this
        method repeatedly never accumulates duplicate favorites or keeps
        stale category entries from an earlier run. Missing parent
        directories of the output file are created.
        """
        # Start from a clean slate so repeated calls are idempotent.
        self.data = {"categories": [], "favorites": []}
        self._process_directory()

        # Deduplicate 'categories' using 'path' as the unique key; the first
        # entry collected for a directory wins.
        unique_categories = {}
        for category in self.data["categories"]:
            unique_categories.setdefault(category.get("path", "unknown"), category)

        # Replace the 'categories' list with a version sorted by 'path'.
        self.data["categories"] = sorted(
            unique_categories.values(),
            key=lambda category: category.get("path", "unknown"),
        )

        # Make sure the destination directory exists before writing.
        output_dir = os.path.dirname(self.output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save the updated JSON to file
        with open(self.output_file, "w", encoding="utf-8") as json_file:
            json.dump(self.data, json_file, indent=4, ensure_ascii=False)

        print(f"Generated JSON saved to {self.output_file}")
Lets test 2024-12-11 23:56:15 +01:00			`import os`
			`import markdown`
			`import json`
			`from typing import List, Dict`


			`class MetadataProcessor:`
			`"""`
			`A class to scan Markdown files, extract front matter metadata,`
			`and generate a structured JSON file.`
			`"""`

			`def __init__(self, input_dir: str, output_file: str):`
			`"""`
			`Initialize the MetadataProcessor.`

			`Args:`
			`input_dir (str): Directory containing Markdown files.`
			`output_file (str): Path to save the generated JSON file.`
			`"""`
			`self.input_dir = input_dir`
			`self.output_file = output_file`
			`self.data = {"categories": [], "favorites": []}`

			`def _extract_metadata(self, file_path: str) -> Dict:`
			`"""`
			`Extract front matter metadata using the 'markdown' package.`

			`Args:`
			`file_path (str): Path to the Markdown file.`

			`Returns:`
			`dict: A dictionary containing the extracted metadata.`
			`"""`
			`with open(file_path, "r", encoding="utf-8") as file:`
			`markdown_content = file.read()`

			`# Initialize Markdown with meta extension`
			`md = markdown.Markdown(extensions=["extra", "nl2br", "meta"])`
			`md.convert(markdown_content)`

			`# Metadata is stored in md.Meta as a dictionary of lists`
			`meta = {key: " ".join(value) for key, value in md.Meta.items()} if md.Meta else {}`
			`return meta`

			`def _process_directory(self):`
			`"""`
			`Recursively scan the input directory for Markdown files`
			`and extract metadata to build the JSON structure.`
			`"""`
			`for root, _, files in os.walk(self.input_dir):`
			`for file in files:`
			`if file.endswith(".md"):`
			`file_path = os.path.join(root, file)`
			`metadata = self._extract_metadata(file_path)`

			`if metadata:`
			`# Add to 'categories'`
			`self.data["categories"].append({`
			`"name": metadata.get("name", "Unknown"),`
			`"path": os.path.relpath(root, self.input_dir).replace(os.sep, "/"),`
			`"author": metadata.get("author", "Unknown")`
			`})`

			`# Add to 'favorites' if 'favorite' is true`
			`if metadata.get("favorite") and metadata["favorite"].lower() == "true":`
			`self.data["favorites"].append({`
			`"name": metadata.get("name", "Unknown"),`
			`"image": metadata.get("image", "images/default.jpg"),`
Lets go back 2024-12-13 23:11:30 +01:00			`"description": metadata.get("summary", "No description provided"),`
			`"path": os.path.relpath(root, self.input_dir).replace(os.sep, "/"),`
Lets test 2024-12-11 23:56:15 +01:00			`})`

			`def generate_json(self):`
			`"""`
Loads and loads of data 2024-12-12 23:30:19 +01:00			`Generate the JSON structure, deduplicate and sort categories by 'path',`
			`then save it to the output file.`
Lets test 2024-12-11 23:56:15 +01:00			`"""`
Loads and loads of data 2024-12-12 23:30:19 +01:00			`self._process_directory() # Extract all markdown data into self.data`
Lets test 2024-12-11 23:56:15 +01:00
Loads and loads of data 2024-12-12 23:30:19 +01:00			`# Ensure 'categories' exists and is a list`
			`if "categories" not in self.data:`
			`self.data["categories"] = []`
Lets test 2024-12-11 23:56:15 +01:00
Loads and loads of data 2024-12-12 23:30:19 +01:00			`# Deduplicate 'categories' using 'path' as the unique key`
			`unique_categories = { }`
			`for category in self.data["categories"]:`
			`if isinstance( category, dict ): # Ensure valid category structure`
			`path = category.get( "path", "unknown" ) # Use 'path' as the unique key`
			`if path not in unique_categories:`
			`unique_categories[path] = category`

			`# Replace the 'categories' list with a sorted version by 'path'`
			`self.data["categories"] = sorted(`
			`unique_categories.values(),`
			`key = lambda x: x.get( "path", "unknown" )`
			`)`

			`# Save the updated JSON to file`
			`with open( self.output_file, "w", encoding = "utf-8" ) as json_file:`
			`json.dump( self.data, json_file, indent = 4, ensure_ascii = False )`

			`print( f"Generated JSON saved to {self.output_file}" )`