260 lines
No EOL
9.9 KiB
Python
Executable file
260 lines
No EOL
9.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
FAQ RSS Validator
|
|
Validates FAQ.RSS for well-formed XML and site-specific constraints.
|
|
Checks for no CDATA, allowed title values, and proper HTML escaping.
|
|
"""
|
|
|
|
import sys
|
|
import xml.etree.ElementTree as ET
|
|
import re
|
|
from pathlib import Path
|
|
|
|
class FAQRSSValidator:
    """Validate an FAQ RSS feed for well-formed XML and site-specific rules.

    Problems are collected in ``self.errors`` (which make validation fail)
    and ``self.warnings`` (advisory only). ``validate()`` runs every check
    and prints a report.
    """

    # Fully qualified tag ElementTree uses for <content:encoded>.
    CONTENT_TAG = '{http://purl.org/rss/1.0/modules/content/}encoded'
    # Text allowed between '<' and '>' inside item content.
    ALLOWED_TAGS = frozenset({'br', 'strong', '/strong'})
    # Compiled once instead of once per item.
    _TAG_RE = re.compile(r'<([^>]*)>')
    _BARE_AMP_RE = re.compile(r'&(?![a-zA-Z0-9#]+;)')

    def __init__(self, rss_file="FAQ.RSS"):
        """Create a validator for *rss_file* (a path; nothing is read yet)."""
        self.rss_file = Path(rss_file)
        self.errors = []
        self.warnings = []
        # Set by validate_xml_structure(); None until a parse has succeeded.
        self.root = None
        self.channel = None
        self.allowed_titles = {
            "Config Files",
            "Startup Parameters",
            "Troubleshooting",
            "Steam Workshop",
        }

    def _items(self):
        """Return the channel's <item> elements, or [] if nothing is parsed."""
        channel = getattr(self, 'channel', None)
        # Compare against None explicitly: an Element without children is falsy.
        if channel is None:
            return []
        return channel.findall('item')

    def _report_missing_title(self, index):
        """Record a missing-title error for item *index*, at most once.

        Two checks detect the same defect; without this guard a full
        validate() run would count it twice.
        """
        message = f"Item {index}: Missing title element"
        if message not in self.errors:
            self.errors.append(message)

    def validate_xml_structure(self):
        """Parse the file and locate the <rss>/<channel> elements.

        Returns:
            True when the file parses, its root is 'rss' and it has a
            'channel' child (stored in ``self.root`` / ``self.channel``).
            False otherwise, with the reason appended to ``self.errors``.
        """
        try:
            tree = ET.parse(self.rss_file)
            self.root = tree.getroot()

            if self.root.tag != 'rss':
                self.errors.append("Root element must be 'rss'")
                return False

            channel = self.root.find('channel')
            if channel is None:
                self.errors.append("Missing 'channel' element")
                return False

            self.channel = channel
            return True

        except ET.ParseError as e:
            self.errors.append(f"XML parsing error: {e}")
            return False
        except FileNotFoundError:
            self.errors.append(f"RSS file not found: {self.rss_file}")
            return False
        except Exception as e:
            # Top-level boundary: report anything else (permissions, a
            # directory path, ...) as a validation error instead of crashing.
            self.errors.append(f"Unexpected error parsing XML: {e}")
            return False

    def check_no_cdata(self):
        """Record an error for every line of the raw file containing CDATA.

        Works on the raw text because the XML parser merges CDATA sections
        into ordinary element text, making them undetectable afterwards.
        """
        try:
            content = self.rss_file.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            self.errors.append(f"Error reading file for CDATA check: {e}")
            return

        if '<![CDATA[' not in content:
            return

        self.errors.append("CDATA sections are not allowed")
        for line_no, line in enumerate(content.split('\n'), 1):
            if '<![CDATA[' in line:
                self.errors.append(f" CDATA found at line {line_no}")

    def check_allowed_titles(self):
        """Record an error for each item whose title is missing or not allowed."""
        allowed = ', '.join(sorted(self.allowed_titles))

        for i, item in enumerate(self._items(), 1):
            title_elem = item.find('title')
            if title_elem is None:
                self._report_missing_title(i)
                continue

            title = title_elem.text
            if title not in self.allowed_titles:
                self.errors.append(
                    f"Item {i}: Invalid title '{title}'. Allowed: {allowed}"
                )

    def check_required_elements(self):
        """Check each item for title, category and content:encoded elements.

        Missing elements and an empty category are errors; empty content is
        only a warning.
        """
        for i, item in enumerate(self._items(), 1):
            if item.find('title') is None:
                self._report_missing_title(i)

            category_elem = item.find('category')
            if category_elem is None:
                self.errors.append(f"Item {i}: Missing category element")
            elif not category_elem.text or not category_elem.text.strip():
                self.errors.append(f"Item {i}: Empty category")

            content_elem = item.find(self.CONTENT_TAG)
            if content_elem is None:
                self.errors.append(f"Item {i}: Missing content:encoded element")
            elif not content_elem.text or not content_elem.text.strip():
                self.warnings.append(f"Item {i}: Empty content")

    def check_html_escaping(self):
        """Flag disallowed tags and bare ampersands in each item's content.

        Operates on the parsed element text. Any ``<...>`` span whose inner
        text is not in ``ALLOWED_TAGS`` is an error, as is any ``&`` that
        does not begin an ``&name;`` / ``&#n;`` style entity.
        """
        # NOTE(review): the exemptions below (inner text starting with '<',
        # or of the form '&...;') are kept exactly as found. They look like
        # they may once have referred to escaped entities - confirm intent.
        for i, item in enumerate(self._items(), 1):
            content_elem = item.find(self.CONTENT_TAG)
            if content_elem is None or not content_elem.text:
                continue

            content = content_elem.text

            for match in self._TAG_RE.finditer(content):
                tag = match.group(1).strip()
                if tag in self.ALLOWED_TAGS or tag.startswith('<'):
                    continue
                if tag.startswith('&') and tag.endswith(';'):
                    continue
                self.errors.append(f"Item {i}: Unescaped HTML tag '<{tag}>'")

            if self._BARE_AMP_RE.search(content):
                self.errors.append(f"Item {i}: Unescaped ampersand(s) found")

    def check_alphabetical_order(self):
        """Warn when items are not sorted by category, then title.

        Only items that have both a category and a title element take part.
        """
        pairs = []
        for item in self._items():
            category_elem = item.find('category')
            title_elem = item.find('title')
            if category_elem is not None and title_elem is not None:
                # Empty elements have text None; sorting None together with
                # str raises TypeError, so treat them as empty strings.
                pairs.append((category_elem.text or '', title_elem.text or ''))

        if pairs != sorted(pairs):
            self.warnings.append(
                "Items are not in alphabetical order by category then title"
            )

    def check_duplicate_items(self):
        """Record an error for every repeated (category, title) combination."""
        seen = set()

        for i, item in enumerate(self._items(), 1):
            category_elem = item.find('category')
            title_elem = item.find('title')

            if category_elem is not None and title_elem is not None:
                key = (category_elem.text, title_elem.text)
                if key in seen:
                    self.errors.append(
                        f"Item {i}: Duplicate category/title combination: {key}"
                    )
                seen.add(key)

    def check_content_formatting(self):
        """Warn about content that uses raw newlines without any <br> tag."""
        for i, item in enumerate(self._items(), 1):
            content_elem = item.find(self.CONTENT_TAG)
            if content_elem is None or not content_elem.text:
                continue

            content = content_elem.text

            if '\n' in content and '<br>' not in content:
                self.warnings.append(
                    f"Item {i}: Consider using <br> tags instead of newlines"
                )

            # NOTE(review): a second check used to live here, testing
            # "'<strong>' in content and '<strong>' not in content". That
            # condition can never be true, so it never produced a warning.
            # It presumably lost an escaped form (e.g. '&lt;strong&gt;') at
            # some point; restore it once the intended comparison is known.

    def generate_statistics(self):
        """Print the item count and the number of items per category."""
        items = self._items()
        categories = {}

        for item in items:
            category_elem = item.find('category')
            if category_elem is not None and category_elem.text:
                name = category_elem.text
                categories[name] = categories.get(name, 0) + 1

        print("\nStatistics:")
        print(f"Total items: {len(items)}")
        print(f"Total categories: {len(categories)}")

        if categories:
            print("\nItems per category:")
            for category in sorted(categories):
                print(f" {category}: {categories[category]} items")

    def _print_report(self):
        """Print the collected errors and warnings, or a success line."""
        print(f"Found {len(self.errors)} errors and {len(self.warnings)} warnings")

        if self.errors:
            print(f"\n❌ ERRORS ({len(self.errors)}):")
            for error in self.errors:
                print(f" • {error}")

        if self.warnings:
            print(f"\n⚠️ WARNINGS ({len(self.warnings)}):")
            for warning in self.warnings:
                print(f" • {warning}")

        if not self.errors and not self.warnings:
            print("\n✅ All validation checks passed!")

    def validate(self):
        """Run all validation checks and print a report.

        Returns:
            True when no errors were recorded (warnings do not fail the
            run), False otherwise.
        """
        print(f"Validating RSS file: {self.rss_file}")
        print("=" * 50)

        if not self.validate_xml_structure():
            # Without a parsed channel no other check can run, but the
            # reason must still reach the user rather than a silent failure.
            self._print_report()
            return False

        self.check_no_cdata()
        self.check_allowed_titles()
        self.check_required_elements()
        self.check_html_escaping()
        self.check_alphabetical_order()
        self.check_duplicate_items()
        self.check_content_formatting()

        self._print_report()
        self.generate_statistics()

        return len(self.errors) == 0

    def run(self):
        """Main execution method; returns the result of validate()."""
        return self.validate()
|
|
|
|
def main():
    """Command-line entry point: validate a feed and exit with its status.

    The feed path is the first command-line argument, defaulting to
    "FAQ.RSS". The process exits with status 0 when validation succeeds
    and 1 otherwise.
    """
    rss_file = sys.argv[1] if len(sys.argv) > 1 else "FAQ.RSS"
    success = FAQRSSValidator(rss_file).run()
    sys.exit(1 if not success else 0)


if __name__ == "__main__":
    main()