#!/usr/bin/env python3
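"""sync-to-pro: one-way content sync from arulbalaji.xyz to the professional site.

Copies journal markdowns tagged "Tech" or "Tech tutorials" into the professional
site's content/post folder with rewritten frontmatter (date, description,
featured_image, title), writes plain-text summaries to the summaries folder,
mirrors the shared media folder, and logs activity to /var/log/hugo_sync.log.
"""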
import os
import re
import shutil
import logging
from pathlib import Path
import pwd
import grp
import filecmp
# ==== CONFIG ====
ARUL_SPACE_CONTENT = "/var/www/html/arulbalaji.xyz/content/journal"
PRO_CONTENT = "/var/www/html/professional-site/content/post"
ARUL_SPACE_MEDIA = "/var/www/html/arulbalaji.xyz/static/media"
PRO_MEDIA = "/var/www/html/professional-site/static/media"
SUMMARY_FOLDER = "/var/www/html/professional-site/assets/summaries"
LOG_FILE = "/var/log/hugo_sync.log"
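# Only posts whose frontmatter tags include one of these are copied to the pro site.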
VALID_TAGS = {"Tech", "Tech tutorials"}
OWNER_USER = "arul" # Set to None to skip chown
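# Shape of the journal posts this script expects (hypothetical example; the exact
# source format is an assumption based on what the parsing helpers below accept):
#
#   ---
#   title: "My post"
#   date: 2024-01-01
#   tags: ["Tech"]
#   ---
#   #pro-site description: One-line blurb shown on the professional site
#   #pro-site featured_image: /media/cover.png
#   Post body, possibly containing Hugo shortcodes like {{< youtube id >}} ...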
# ==== LOGGER SETUP ====
logger = logging.getLogger("SyncLogger")
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
file_handler = logging.FileHandler(LOG_FILE)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
# ==== HELPERS ====
def parse_frontmatter(lines):
    """Parse a simple key: value frontmatter block delimited by '---' lines.

    Returns the frontmatter as a dict plus the index of the first body line.
    """
    front = {}
    in_frontmatter = False
    body_start = 0
    for i, line in enumerate(lines):
        if line.strip() == "---":
            if not in_frontmatter:
                in_frontmatter = True
            else:
                body_start = i + 1
                break
        elif in_frontmatter and ':' in line:
            key, value = line.split(':', 1)
            front[key.strip()] = value.strip().strip('"')
    return front, body_start
def extract_meta_comments(lines):
    """Pull the pro-site description and featured image from '#pro-site ...' lines."""
    description = None
    featured = None
    for line in lines:
        if "#pro-site description:" in line:
            description = line.split(":", 1)[1].strip()
        if "#pro-site featured_image:" in line:
            featured = line.split(":", 1)[1].strip()
    return description, featured
def has_valid_tags(front):
    """Return True if any quoted tag in the frontmatter matches VALID_TAGS."""
    tags_line = front.get("tags", "")
    tags = re.findall(r'"(.*?)"', tags_line)
    return any(tag in VALID_TAGS for tag in tags)
def chown_to_user(file_path, username):
    """Change ownership of file_path to the given user; log a warning on failure."""
    if not username:
        return
    try:
        uid = pwd.getpwnam(username).pw_uid
        gid = grp.getgrnam(username).gr_gid
        os.chown(file_path, uid, gid)
    except Exception as e:
        logger.warning(f"Could not chown {file_path}: {e}")
def strip_shortcodes(content):
    """Remove Hugo shortcodes of the form {{< ... >}} from the content."""
    return re.sub(r"{{<[^>]+>}}", "", content)
def files_are_identical(file1, file2):
    """Compare two files cheaply by size first, then byte-for-byte."""
    try:
        if os.path.getsize(file1) != os.path.getsize(file2):
            return False
        return filecmp.cmp(file1, file2, shallow=False)
    except Exception as e:
        logger.warning(f"Error comparing files {file1} and {file2}: {e}")
        return False
# ==== TRANSFORMATION ====
def transform_file(file_path):
    """Rebuild a journal post for the professional site.

    Returns (full_markdown, plain_text_summary), or (None, None) if the file
    is hidden, unreadable, or not tagged for the pro site.
    """
    # Skip hidden files and anything under a dot-directory before reading.
    if any(part.startswith('.') for part in file_path.parts):
        return None, None
    try:
        with open(file_path, "r") as f:
            lines = f.readlines()
    except PermissionError:
        logger.error(f"Permission denied: {file_path}")
        return None, None
    front, body_start = parse_frontmatter(lines)
    if not has_valid_tags(front):
        return None, None
    description, featured_image = extract_meta_comments(lines)
    title = front.get("title", "Untitled")
    raw_date = front.get("date", "1970-01-01")
    formatted_date = f"{raw_date}T00:00:00+05:30"
    body = "".join(lines[body_start:]).strip()
    new_frontmatter = f"""---
date: {formatted_date}
description: "{description or ''}"
featured_image: "{featured_image or ''}"
title: "{title}"
---
"""
    full_markdown = new_frontmatter + "\n\n" + body
    plain_text_summary = strip_shortcodes(body).strip()
    return full_markdown, plain_text_summary
# ==== MEDIA FOLDER COMPARISON ====
def are_folders_identical(folder1, folder2):
    """Return True if both folders hold the same relative files with identical contents."""
    def get_all_files(base_folder):
        file_set = set()
        for root, _, files in os.walk(base_folder):
            for f in files:
                full_path = os.path.join(root, f)
                rel_path = os.path.relpath(full_path, base_folder)
                file_set.add(rel_path)
        return file_set
    if not os.path.exists(folder1) or not os.path.exists(folder2):
        return False
    files1 = get_all_files(folder1)
    files2 = get_all_files(folder2)
    if files1 != files2:
        return False
    for rel_file in files1:
        file1 = os.path.join(folder1, rel_file)
        file2 = os.path.join(folder2, rel_file)
        if not files_are_identical(file1, file2):
            return False
    return True
# ==== MAIN SYNC CHECK ====
def check_already_synced():
    """Return True if every eligible markdown already exists in PRO_CONTENT with identical content."""
    for md_file in Path(ARUL_SPACE_CONTENT).rglob("*.md"):
        transformed, _ = transform_file(md_file)
        if not transformed:
            continue
        pro_file_path = Path(PRO_CONTENT) / md_file.name
        if not pro_file_path.exists():
            return False
        try:
            with open(pro_file_path, "r") as f:
                pro_content = f.read()
            if pro_content != transformed:
                return False
        except Exception as e:
            logger.warning(f"Could not read {pro_file_path}: {e}")
            return False
    return True
# ==== MEDIA SYNC ====
def copy_media_folder():
    """Replace the professional site's media folder with a fresh copy from Arul's space."""
    if not os.path.exists(ARUL_SPACE_MEDIA):
        logger.warning(f"Media folder not found in Arul's space at {ARUL_SPACE_MEDIA}")
        return
    try:
        if os.path.exists(PRO_MEDIA):
            shutil.rmtree(PRO_MEDIA)
        shutil.copytree(ARUL_SPACE_MEDIA, PRO_MEDIA, dirs_exist_ok=True)
        if OWNER_USER:
            for root, dirs, files in os.walk(PRO_MEDIA):
                for name in files:
                    chown_to_user(os.path.join(root, name), OWNER_USER)
        logger.info("📁 Media folder synced from Arul's space → professional site")
    except Exception as e:
        logger.error(f"Failed to sync media folder from Arul's space: {e}")
# ==== MAIN SYNC ====
def sync_markdowns():
    """Transform and copy eligible markdowns, write their plain-text summaries, and return the synced filenames."""
    synced = []
    Path(PRO_CONTENT).mkdir(parents=True, exist_ok=True)
    Path(SUMMARY_FOLDER).mkdir(parents=True, exist_ok=True)
    for md_file in Path(ARUL_SPACE_CONTENT).rglob("*.md"):
        try:
            transformed, summary = transform_file(md_file)
            if transformed:
                out_path = Path(PRO_CONTENT) / md_file.name
                with open(out_path, "w") as out_file:
                    out_file.write(transformed)
                if OWNER_USER:
                    chown_to_user(out_path, OWNER_USER)
                synced.append(str(md_file.name))
                summary_path = Path(SUMMARY_FOLDER) / (md_file.stem + ".txt")
                with open(summary_path, "w") as sum_file:
                    sum_file.write(summary)
                if OWNER_USER:
                    chown_to_user(summary_path, OWNER_USER)
        except Exception as e:
            logger.error(f"Error processing {md_file.name}: {e}")
    return synced
# ==== EXECUTION ====
if __name__ == "__main__":
    if check_already_synced() and are_folders_identical(ARUL_SPACE_MEDIA, PRO_MEDIA):
        logger.info("☑️ Already in sync. No changes needed.")
    else:
        synced_files = sync_markdowns()
        copy_media_folder()
        if synced_files:
            for f in synced_files:
                logger.info(f"Synced: {f}")
            logger.info(f"✅ Synced files: {', '.join(synced_files)}")
        else:
            logger.info("☑️ No new valid tech/tutorial markdowns to sync.")