migrate-contents-automation/generate-summaries/summarize.py

57 lines
1.9 KiB
Python

import os
import re
import shutil
def _strip_hugo_markup(content):
    """Return *content* with Hugo-specific markup and noise removed.

    The steps are order-sensitive: the page is truncated at the comments
    shortcode *before* generic shortcodes are stripped, because the generic
    pattern would otherwise delete the ``{{< comments >}}`` marker and leave
    everything after it in place.
    """
    # Remove YAML frontmatter (with surrounding ---). The fence must open the
    # file and the closing '---' must start a line. The optional lazy group
    # is matched at most once, so an unterminated fence fails in linear time
    # instead of backtracking exponentially.
    content = re.sub(r'\A---\n(?:.*?\n)??---\n', '', content, flags=re.DOTALL)
    # Remove the comments shortcode and everything after it.
    content = re.sub(r'{{<\s*comments\s*>}}.*', '', content, flags=re.DOTALL)
    # Remove all remaining Hugo shortcodes like {{< ... >}} or {{< /... >}}.
    content = re.sub(r'{{<[^>]*>}}', '', content)
    # Blank out shell prompt lines; the emptied line itself is kept and is
    # folded away by the blank-line collapse below.
    content = re.sub(r'^hacker@selfhost1:.*$', '', content, flags=re.MULTILINE)
    # Collapse runs of three or more newlines down to a single blank line.
    content = re.sub(r'\n{3,}', '\n\n', content)
    # Strip leading/trailing whitespace.
    return content.strip()


def clean_md_file(input_path, output_path):
    """Write a plain-text version of a Hugo Markdown post.

    Reads ``input_path`` as UTF-8, removes the YAML frontmatter, everything
    from the ``{{< comments >}}`` shortcode onward, any other ``{{< ... >}}``
    shortcodes and ``hacker@selfhost1:`` prompt lines, collapses excess blank
    lines, and writes the stripped result to ``output_path`` as UTF-8.

    Args:
        input_path: Path of the Markdown file to read.
        output_path: Path of the text file to create or overwrite.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    cleaned = _strip_hugo_markup(content)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(cleaned)
    print(f"Processed: {input_path} -> {output_path}")
def batch_clean_md_files(
    input_dir='/var/www/html/professional-site/content/post',
    assets_dir='/var/www/html/professional-site/assets/summaries',
    static_dir='/var/www/html/professional-site/static/summaries',
):
    """Clean every ``.md`` file in ``input_dir`` into ``.txt`` summaries.

    Each Markdown file directly inside ``input_dir`` is passed through
    ``clean_md_file`` and written to ``assets_dir`` under the same name with
    the ``.md`` suffix swapped for ``.txt``; that output is then copied to
    ``static_dir``. Both output directories are created if missing.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.md`` files.
        assets_dir: Directory that receives the cleaned text files.
        static_dir: Directory that receives a copy of each cleaned file.

    Raises:
        OSError: If ``input_dir`` cannot be listed or a file cannot be
            read, written or copied.
    """
    md_suffix = '.md'
    os.makedirs(assets_dir, exist_ok=True)
    os.makedirs(static_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(md_suffix):
            continue
        input_path = os.path.join(input_dir, filename)
        # A directory whose name ends in '.md' cannot be opened as a post.
        if not os.path.isfile(input_path):
            continue

        # Swap only the trailing suffix; str.replace would also rewrite any
        # earlier '.md' inside the name (e.g. 'notes.md.backup.md').
        output_filename = filename[:-len(md_suffix)] + '.txt'
        output_path_assets = os.path.join(assets_dir, output_filename)
        output_path_static = os.path.join(static_dir, output_filename)

        clean_md_file(input_path, output_path_assets)
        # Also copy the cleaned file to the static folder.
        shutil.copyfile(output_path_assets, output_path_static)
        print(f"Copied summary to static: {output_path_static}")
# Run the batch conversion. This call is at module level, so it executes as
# soon as the file is run or imported.
batch_clean_md_files()