From 5b5d29c01e69d7d569ab03695fd43acc0fcc9aeb Mon Sep 17 00:00:00 2001 From: xcrong Date: Fri, 20 Feb 2026 23:20:17 +0800 Subject: [PATCH] Add check_links.py utility script --- check_links.py | 300 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 300 insertions(+) create mode 100644 check_links.py diff --git a/check_links.py b/check_links.py new file mode 100644 index 0000000..3fb0a7f --- /dev/null +++ b/check_links.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +""" +检测 awesome-openclaw-skills README.md 中链接的有效性。 +使用 HEAD 请求检测状态码,支持通过 GITHUB_TOKEN 环境变量提高 GitHub API 并发限制。 +""" + +import argparse +import os +import re +import ssl +import sys +import time +import urllib.request +import urllib.error +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from http.client import HTTPResponse +from typing import Optional +from urllib.parse import urlparse + + +@dataclass +class LinkResult: + """链接检测结果""" + name: str + url: str + line_num: int + original_line: str + status_code: Optional[int] + error: Optional[str] + is_valid: bool + + +def extract_links_from_readme(filepath: str) -> list[tuple[str, str, int, str]]: + """ + 从 README.md 中提取所有技能链接。 + + 返回: [(skill_name, url, line_num, original_line), ...] + """ + pattern = re.compile(r'-\s+\[([^\]]+)\]\((https://github\.com/openclaw/skills/[^\)]+)\)') + + links = [] + with open(filepath, 'r', encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + match = pattern.search(line) + if match: + name, url = match.groups() + links.append((name, url, line_num, line.rstrip('\n'))) + + return links + + +def check_link(name: str, url: str, github_token: Optional[str], timeout: int = 10) -> tuple[Optional[int], Optional[str], bool]: + """ + 使用 HEAD 请求检测单个链接的有效性。 + + 对于 GitHub 链接,使用 GITHUB_TOKEN 进行认证以提高 API 限制。 + GitHub API 限制: + - 未认证:60 次/小时 + - 认证后:5000 次/小时 + + 返回: (status_code, error_msg, is_valid) + """ + # 构建请求 + parsed = urlparse(url) + + # 将 github.com 链接转换为 API 调用以获取更准确的状态 + # 例如:https://github.com/openclaw/skills/tree/main/skills/xxx/SKILL.md + # 转换为:https://api.github.com/repos/openclaw/skills/contents/skills/xxx/SKILL.md?ref=main + + is_github = parsed.netloc == 'github.com' + + if is_github: + # 解析 GitHub URL 路径 + path_parts = parsed.path.split('/') + # /openclaw/skills/tree/main/skills/author/skill-name/SKILL.md + if len(path_parts) >= 6 and path_parts[3] == 'tree': + repo_owner = path_parts[1] + repo_name = path_parts[2] + branch = path_parts[4] + file_path = '/'.join(path_parts[5:]) + + # 构建 GitHub API URL + api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}?ref={branch}" + check_url = api_url + else: + check_url = url + else: + check_url = url + + # 创建请求 + req = urllib.request.Request(check_url, method='HEAD') + + # 设置请求头 + req.add_header('User-Agent', 'awesome-openclaw-skills-link-checker/1.0') + + if is_github and github_token: + req.add_header('Authorization', f'token {github_token}') + # GitHub API 需要 Accept 头 + req.add_header('Accept', 'application/vnd.github.v3+json') + + # 创建 SSL 上下文 + ssl_context = ssl.create_default_context() + + try: + with urllib.request.urlopen(req, timeout=timeout, context=ssl_context) as response: + # 对于 GitHub API,HEAD 请求可能不支持,需要处理 + if isinstance(response, HTTPResponse): + status_code = response.status + else: + status_code = 200 + + is_valid = 200 <= status_code < 400 + + return (status_code, None, is_valid) + + except urllib.error.HTTPError as e: + status_code = e.code + error_msg = None + + if status_code == 404: + error_msg = "Not Found" + is_valid = False + elif status_code == 403: + error_msg = "Forbidden (rate limited?)" + # 速率限制表示资源存在,只是暂时无法访问 + is_valid = True + elif status_code == 429: + error_msg = "Too Many Requests" + # 速率限制表示资源存在,只是暂时无法访问 + is_valid = True + else: + error_msg = f"HTTP {status_code}" + is_valid = False + + return (status_code, error_msg, is_valid) + + except urllib.error.URLError as e: + return (None, f"URL Error: {e.reason}", False) + + except TimeoutError: + return (None, "Timeout", False) + + except Exception as e: + return (None, f"Error: {str(e)}", False) + + +def check_all_links( + links: list[tuple[str, str, int, str]], + github_token: Optional[str], + max_workers: int = 10, + rate_limit_delay: float = 0.1 +) -> list[LinkResult]: + """ + 并发检测所有链接。 + + 参数: + links: [(name, url, line_num, original_line), ...] + github_token: GitHub 个人访问令牌 + max_workers: 最大并发数 + rate_limit_delay: 每次请求之间的延迟(秒) + """ + results = [] + total = len(links) + + print(f"开始检测 {total} 个链接...") + print(f"并发数: {max_workers}") + print(f"GITHUB_TOKEN: {'已设置' if github_token else '未设置 (限制: 60次/小时)'}") + print("-" * 60) + + def check_with_delay(link_tuple): + name, url, line_num, original_line = link_tuple + status_code, error, is_valid = check_link(name, url, github_token) + time.sleep(rate_limit_delay) # 添加延迟以避免触发速率限制 + return LinkResult( + name=name, + url=url, + line_num=line_num, + original_line=original_line, + status_code=status_code, + error=error, + is_valid=is_valid + ) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(check_with_delay, link): link + for link in links + } + + completed = 0 + for future in as_completed(futures): + completed += 1 + result = future.result() + results.append(result) + + # 显示进度(始终打印 URL) + status_icon = "✓" if result.is_valid else "✗" + if result.is_valid: + print(f"[{completed}/{total}] {status_icon} {result.name}") + print(f" {result.url}") + else: + error_info = result.error or f"HTTP {result.status_code}" + print(f"[{completed}/{total}] {status_icon} {result.name} - {error_info}") + print(f" {result.url}") + + return results + + +def delete_invalid_lines(readme_path: str, results: list[LinkResult]) -> int: + """ + 删除 README.md 中无效链接所在的行。 + + 返回: 删除的行数 + """ + # 收集需要删除的行号 + invalid_lines = {r.line_num for r in results if not r.is_valid} + + if not invalid_lines: + return 0 + + # 读取所有行 + with open(readme_path, 'r', encoding='utf-8') as f: + lines = f.readlines() + + # 过滤掉无效行 + new_lines = [ + line for line_num, line in enumerate(lines, 1) + if line_num not in invalid_lines + ] + + # 写回文件 + with open(readme_path, 'w', encoding='utf-8') as f: + f.writelines(new_lines) + + return len(invalid_lines) + + +def main(): + # 解析命令行参数 + parser = argparse.ArgumentParser(description='检测 README.md 中链接的有效性') + parser.add_argument('--delete', action='store_true', help='删除无效链接所在的行') + args = parser.parse_args() + + # 获取 README.md 路径 + script_dir = os.path.dirname(os.path.abspath(__file__)) + readme_path = os.path.join(script_dir, "README.md") + + if not os.path.exists(readme_path): + print(f"错误: 找不到 README.md 文件: {readme_path}") + sys.exit(1) + + # 获取 GITHUB_TOKEN + github_token = os.environ.get("GITHUB_TOKEN") + + # 提取链接 + print(f"正在读取 {readme_path}...") + links = extract_links_from_readme(readme_path) + print(f"找到 {len(links)} 个链接") + print() + + if not links: + print("没有找到任何链接") + sys.exit(0) + + # 检测链接 + # 对于 GitHub API,如果使用 token,可以更高并发 + # 未使用 token 时,降低并发以避免触发速率限制 + max_workers = 20 if github_token else 5 + rate_limit_delay = 0.05 if github_token else 0.5 + + results = check_all_links( + links, + github_token, + max_workers=max_workers, + rate_limit_delay=rate_limit_delay + ) + + # 统计结果 + print() + print("=" * 60) + valid_count = sum(1 for r in results if r.is_valid) + invalid_count = len(results) - valid_count + print(f"检测完成: 有效 {valid_count}, 无效 {invalid_count}") + + # 如果需要,删除无效行 + if args.delete and invalid_count > 0: + print() + print("正在删除无效链接...") + deleted = delete_invalid_lines(readme_path, results) + print(f"已删除 {deleted} 行") + + # 返回退出码 + if invalid_count > 0: + sys.exit(1) + sys.exit(0) + + +if __name__ == "__main__": + main() \ No newline at end of file