#!/usr/bin/env python3
"""
FAQ RSS Validator
Validates FAQ.RSS for well-formed XML and site-specific constraints.
Checks for no CDATA, allowed title values, and proper HTML escaping.
"""
import sys
import xml.etree.ElementTree as ET
import re
from pathlib import Path
class FAQRSSValidator:
    """Validate an FAQ RSS feed for well-formed XML and site-specific rules.

    Problems are collected rather than raised: ``errors`` holds findings that
    make ``validate()`` return False, ``warnings`` holds advisory findings.

    NOTE(review): parts of this class were unreadable in the reviewed copy
    (the CDATA check was cut off mid-literal and ``check_allowed_titles``,
    ``check_required_elements`` and the head of ``check_html_escaping`` were
    missing although ``validate`` calls them).  Those pieces were rebuilt from
    the module docstring, ``allowed_titles`` and the surviving fragments;
    confirm their messages and severities against the intended site rules.
    """

    # Fully-qualified tag of <content:encoded> as ElementTree reports it.
    CONTENT_TAG = '{http://purl.org/rss/1.0/modules/content/}encoded'

    # (tag to look up, label used in messages) for elements every item needs.
    # NOTE(review): reconstructed list - these are the three elements the
    # other checks read; adjust if the site requires more or fewer.
    REQUIRED_ITEM_ELEMENTS = (
        ('title', 'title'),
        ('category', 'category'),
        (CONTENT_TAG, 'content:encoded'),
    )

    # Anything that looks like <...> inside item content.
    _TAG_RE = re.compile(r'<([^>]*)>')
    # An ampersand that does not start a named or numeric entity.
    _BARE_AMP_RE = re.compile(r'&(?![a-zA-Z0-9#]+;)')

    def __init__(self, rss_file="FAQ.RSS"):
        """Create a validator for *rss_file* (path or string); nothing is read yet."""
        self.rss_file = Path(rss_file)
        self.errors = []
        self.warnings = []
        # Filled in by validate_xml_structure(); None until a parse succeeds.
        self.root = None
        self.channel = None
        self.allowed_titles = {
            "Config Files",
            "Startup Parameters",
            "Troubleshooting",
            "Steam Workshop",
        }

    def _items(self):
        """Return the channel's <item> elements, or [] if no channel is loaded."""
        channel = getattr(self, 'channel', None)
        if channel is None:
            return []
        return channel.findall('item')

    def validate_xml_structure(self):
        """Parse the file and locate <rss>/<channel>.

        Returns True on success (``self.root`` and ``self.channel`` are set);
        otherwise records an error and returns False.
        """
        try:
            tree = ET.parse(self.rss_file)
        except ET.ParseError as e:
            self.errors.append(f"XML parsing error: {e}")
            return False
        except FileNotFoundError:
            self.errors.append(f"RSS file not found: {self.rss_file}")
            return False
        except Exception as e:
            # Boundary catch-all (permissions, decoding, ...): report, don't crash.
            self.errors.append(f"Unexpected error parsing XML: {e}")
            return False

        self.root = tree.getroot()
        if self.root.tag != 'rss':
            self.errors.append("Root element must be 'rss'")
            return False

        channel = self.root.find('channel')
        if channel is None:
            self.errors.append("Missing 'channel' element")
            return False

        self.channel = channel
        return True

    def check_no_cdata(self):
        """Record an error if the raw file contains a CDATA section.

        The raw text is scanned because the XML parser merges CDATA into
        ordinary text, so the parsed tree cannot tell the two apart.
        """
        try:
            with open(self.rss_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # NOTE(review): the original handler was lost; message reconstructed.
            self.errors.append(f"Could not read file for CDATA check: {e}")
            return

        if '<![CDATA[' in content:
            # NOTE(review): original message was lost; wording reconstructed.
            self.errors.append("CDATA sections are not allowed")

    def check_allowed_titles(self):
        """Record an error for each item whose <title> is not in ``allowed_titles``.

        Items without a title are left to check_required_elements().
        NOTE(review): reconstructed method - confirm severity and wording.
        """
        allowed = ", ".join(sorted(self.allowed_titles))
        for i, item in enumerate(self._items(), 1):
            title_elem = item.find('title')
            if title_elem is None or not title_elem.text:
                continue
            if title_elem.text not in self.allowed_titles:
                self.errors.append(
                    f"Item {i}: Title '{title_elem.text}' is not allowed "
                    f"(allowed: {allowed})"
                )

    def check_required_elements(self):
        """Record an error for each item missing (or leaving empty) a required element.

        NOTE(review): reconstructed method - see REQUIRED_ITEM_ELEMENTS.
        """
        for i, item in enumerate(self._items(), 1):
            for tag, label in self.REQUIRED_ITEM_ELEMENTS:
                elem = item.find(tag)
                if elem is None:
                    self.errors.append(f"Item {i}: Missing required element '{label}'")
                elif not "".join(elem.itertext()).strip():
                    # itertext() so content made only of child markup is not "empty".
                    self.errors.append(f"Item {i}: Element '{label}' is empty")

    def check_html_escaping(self):
        """Record errors for disallowed tags and bare ampersands in item content.

        Works on the parsed text of <content:encoded>, i.e. the HTML that a
        reader would receive after XML entity decoding.
        NOTE(review): the per-item loop was lost and is reconstructed; the
        tag and ampersand rules below are the ones that survived.
        """
        allowed_tags = {'br', 'strong', '/strong'}
        for i, item in enumerate(self._items(), 1):
            content_elem = item.find(self.CONTENT_TAG)
            if content_elem is None or not content_elem.text:
                continue
            content = content_elem.text

            for match in self._TAG_RE.finditer(content):
                tag = match.group(1).strip()
                if tag in allowed_tags or tag.startswith(('<', '>')):
                    continue
                # Something shaped like an entity (&...;) is not a tag.
                if not tag.startswith('&') or not tag.endswith(';'):
                    self.errors.append(f"Item {i}: Unescaped HTML tag '<{tag}>'")

            if self._BARE_AMP_RE.search(content):
                self.errors.append(f"Item {i}: Unescaped ampersand(s) found")

    def check_alphabetical_order(self):
        """Warn if items are not sorted by (category, title).

        Only items having both elements take part; empty elements sort as "".
        """
        entries = []
        for item in self._items():
            category_elem = item.find('category')
            title_elem = item.find('title')
            if category_elem is not None and title_elem is not None:
                # .text is None for empty elements; None cannot be ordered
                # against str, so normalise before sorting.
                entries.append((category_elem.text or "", title_elem.text or ""))

        if entries != sorted(entries):
            self.warnings.append("Items are not in alphabetical order by category then title")

    def check_duplicate_items(self):
        """Record an error for every repeated (category, title) pair."""
        seen = set()
        for i, item in enumerate(self._items(), 1):
            category_elem = item.find('category')
            title_elem = item.find('title')
            if category_elem is None or title_elem is None:
                continue
            key = (category_elem.text, title_elem.text)
            if key in seen:
                self.errors.append(f"Item {i}: Duplicate category/title combination: {key}")
            seen.add(key)

    def check_content_formatting(self):
        """Warn about content that is likely to render badly.

        * Raw ``<strong>`` markup written unescaped in the XML.  The parser
          turns such markup into child elements, so it is detected there
          rather than in the text.
        * Text with line breaks but no ``<br>`` tag.

        NOTE(review): both literals were damaged in the reviewed copy (the
        strong test had degenerated to an always-true ``'' in content``);
        the conditions and messages are reconstructed from the comments.
        """
        for i, item in enumerate(self._items(), 1):
            content_elem = item.find(self.CONTENT_TAG)
            if content_elem is None:
                continue

            if next(content_elem.iter('strong'), None) is not None:
                self.warnings.append(
                    f"Item {i}: Use &lt;strong&gt; instead of <strong> tags"
                )

            content = content_elem.text
            if not content:
                continue
            if '\n' in content and '<br>' not in content:
                self.warnings.append(
                    f"Item {i}: Consider using <br> tags instead of newlines"
                )

    def generate_statistics(self):
        """Print the item count and the number of items per category."""
        items = self._items()
        categories = {}
        for item in items:
            category_elem = item.find('category')
            if category_elem is not None and category_elem.text:
                categories[category_elem.text] = categories.get(category_elem.text, 0) + 1

        print("\nStatistics:")
        print(f"Total items: {len(items)}")
        print(f"Total categories: {len(categories)}")
        if categories:
            print("\nItems per category:")
            for category in sorted(categories):
                print(f" {category}: {categories[category]} items")

    def validate(self):
        """Run every check, print a report, and return True if no errors were found.

        Returns False immediately (without a report) when the file cannot be
        parsed into an <rss>/<channel> structure.
        """
        print(f"Validating RSS file: {self.rss_file}")
        print("=" * 50)

        # Nothing else can run without a parsed channel.
        if not self.validate_xml_structure():
            return False

        self.check_no_cdata()
        self.check_allowed_titles()
        self.check_required_elements()
        self.check_html_escaping()
        self.check_alphabetical_order()
        self.check_duplicate_items()
        self.check_content_formatting()

        print(f"Found {len(self.errors)} errors and {len(self.warnings)} warnings")
        if self.errors:
            print(f"\n❌ ERRORS ({len(self.errors)}):")
            for error in self.errors:
                print(f" • {error}")
        if self.warnings:
            print(f"\n⚠️ WARNINGS ({len(self.warnings)}):")
            for warning in self.warnings:
                print(f" • {warning}")
        if not self.errors and not self.warnings:
            print("\n✅ All validation checks passed!")

        self.generate_statistics()
        return len(self.errors) == 0

    def run(self):
        """Main execution method; returns the result of validate()."""
        return self.validate()
def main():
    """Validate the file named on the command line and exit with a status code.

    The first command-line argument is the path to check; without one,
    "FAQ.RSS" is used.  The process exits 0 when the validator's run()
    reports success and 1 otherwise.
    """
    rss_file = sys.argv[1] if len(sys.argv) > 1 else "FAQ.RSS"
    passed = FAQRSSValidator(rss_file).run()
    sys.exit(0 if passed else 1)
if __name__ == "__main__":
    # Run the command-line entry point only when executed as a script.
    main()