Add a check for articles formatting
diff --git a/.ci/check_knowledge_markup.py b/.ci/check_knowledge_markup.py new file mode 100644 index 0000000..2171c8b --- /dev/null +++ b/.ci/check_knowledge_markup.py
@@ -0,0 +1,86 @@ +import re +import sys +from pathlib import Path + +MARKDOWN_LINK = re.compile(r'(?<!!)\[[^\]]+\]\([^)]+\)') +MARKDOWN_BOLD = re.compile(r'\*\*[^*\n]+\*\*') +MARKDOWN_ITALIC = re.compile(r'(?<!\*)\*(?!\*)[^*\n]+\*(?!\*)') +MARKDOWN_CODE = re.compile(r'`[^`\n]+`') +MARKDOWN_HEADING = re.compile(r'(?m)^#{1,6}\s') + +ALL_PATTERNS = [ + (MARKDOWN_LINK, "markdown link [text](url)"), + (MARKDOWN_BOLD, "markdown bold **text**"), + (MARKDOWN_ITALIC, "markdown italic *text*"), + (MARKDOWN_CODE, "markdown code `backtick`"), + (MARKDOWN_HEADING, "markdown heading #"), +] + +INLINE_TAG_RE = re.compile( + r'<(figcaption|p|li|dt|dd|td|th)[^>]*>[^\n]+</\1>', + re.IGNORECASE, +) + +ADJACENT_BLOCK_RE = re.compile( + r'<(figure|aside|div|section|blockquote)[^>]*>\n([^\n].*?)\n</\1>', + re.DOTALL | re.IGNORECASE, +) + + +def check_inline_tags(content, md_file, errors): + """Catch markdown inside single-line HTML tags like <figcaption>...</figcaption>.""" + for match in INLINE_TAG_RE.finditer(content): + line = match.group(0) + line_num = content[:match.start()].count('\n') + 1 + for pattern, description in ALL_PATTERNS: + if pattern.search(line): + errors.append( + f"{md_file}:{line_num}: {description} inside inline HTML tag:\n" + f" {line[:200]!r}" + ) + break + + +def check_adjacent_blocks(content, md_file, errors): + """Catch markdown on lines immediately adjacent to block HTML tags (no blank line).""" + for block_match in ADJACENT_BLOCK_RE.finditer(content): + inner = block_match.group(2) + if not inner.strip(): + continue + inner_no_images = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', inner) + line_num = content[:block_match.start()].count('\n') + 1 + for pattern, description in ALL_PATTERNS: + if pattern.search(inner_no_images): + errors.append( + f"{md_file}:{line_num}: {description} adjacent to HTML block tag " + f"(no blank line separator):\n" + f" {block_match.group(0)[:200].strip()!r}" + ) + break + + +def main(): + knowledge_root = Path("cc-by-sa/knowledge") + files = sorted(knowledge_root.rglob("*.md")) + + if not files: + print("No .md files found under cc-by-sa/knowledge/") + sys.exit(1) + + errors = [] + for md_file in files: + content = md_file.read_text(encoding="utf-8") + check_inline_tags(content, md_file, errors) + check_adjacent_blocks(content, md_file, errors) + + if errors: + print(f"❌ Found {len(errors)} markdown-in-HTML error(s):\n") + for e in errors: + print(f" {e}\n") + sys.exit(1) + + print(f"✅ Checked {len(files)} files — no markdown markup inside HTML blocks.") + + +if __name__ == "__main__": + main() \ No newline at end of file
diff --git a/.github/workflows/check_knowledge.yaml b/.github/workflows/check_knowledge.yaml new file mode 100644 index 0000000..bee74f2 --- /dev/null +++ b/.github/workflows/check_knowledge.yaml
@@ -0,0 +1,20 @@ +name: Check Knowledge content markup + +on: + pull_request: + paths: + - "cc-by-sa/knowledge/**/*.md" + +jobs: + check-markdown-in-html: + name: No markdown inside HTML blocks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Check for markdown markup inside HTML + run: python3 .ci/check_knowledge_markup.py \ No newline at end of file