mirror of
https://github.com/DragonOS-Community/DragonOS.git
synced 2025-06-18 16:26:31 +00:00
doc: Add ai doc translate tool and add English doc. (#1168)
- add tools/doc_translator.py
- translated docs into English

Signed-off-by: longjin <longjin@DragonOS.org>
tools/doc_translator.py | 467 lines | new file

@@ -0,0 +1,467 @@
#!/usr/bin/env python3
"""
DragonOS documentation auto-translation tool
Usage:
Run this script from the root of the DragonOS source tree.

First enter the docs directory and run the command below to install the dependencies.
pip install -r requirements.txt

Then export the following variables:
export OPENAI_API_KEY=your_api_key
export OPENAI_MODEL=your_model_name (a qwen3 model of 4b or larger is recommended)
export OPENAI_BASE_URL=your_openai_base_url
export MAX_WORKERS=your_max_workers (2-20 recommended)

Then run:
python3 tools/doc_translator.py
"""

import os
import re
import hashlib
import json
from pathlib import Path
import threading
from typing import List, Dict, Tuple
import openai
import datetime
import time

from tqdm import tqdm

# Configuration


def get_env_var(name, required=False, default=None):
    """Read a configuration value from an environment variable."""
    value = os.getenv(name, default)
    if required and not value:
        raise ValueError(f"Environment variable {name} is not set")
    return value


CONFIG = {
    "source_dir": "docs",  # Source documentation directory
    "target_languages": {
        "en": "English",
    },
    "dirs_exclude": ["_build", "locales"],  # Directories to exclude
    "model": get_env_var("OPENAI_MODEL", default="qwen3:4b"),  # Model name
    # API endpoint
    "base_url": get_env_var("OPENAI_BASE_URL", default="http://localhost:11434/v1"),
    "chunk_size": 1000,  # Chunk size (tokens)
    "cache_file": "docs/.translation_cache.json",  # Translation cache file
    "max_workers": int(get_env_var("MAX_WORKERS", default="1")),  # Number of parallel workers
    # Metadata templates
    "meta_templates": {
        ".rst": (
            ".. note:: AI Translation Notice\n\n"
            " This document was automatically translated by `{model}` model, for reference only.\n\n"
            " - Source document: {original_path}\n\n"
            " - Translation time: {timestamp}\n\n"
            " - Translation model: `{model}`\n\n"
            "\n Please report issues via `Community Channel <https://github.com/DragonOS-Community/DragonOS/issues>`_\n\n"
        ),
        ".md": (
            ":::{note}\n"
            "**AI Translation Notice**\n\n"
            "This document was automatically translated by `{model}` model, for reference only.\n\n"
            "- Source document: {original_path}\n\n"
            "- Translation time: {timestamp}\n\n"
            "- Translation model: `{model}`\n\n"
            "Please report issues via [Community Channel](https://github.com/DragonOS-Community/DragonOS/issues)\n\n"
            ":::\n\n"
        )
    }
}


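# As a rough illustration (file name and timestamp are hypothetical, blank lines
# omitted), the ".md" template above rendered with model="qwen3:4b" and
# original_path="kernel/sched.md" starts with:
#   :::{note}
#   **AI Translation Notice**
#   This document was automatically translated by `qwen3:4b` model, for reference only.
#   - Source document: kernel/sched.md
#   ...
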
class LabelManager:
    """Manage document labels and references."""

    def __init__(self, lang: str):
        self.label_map = {}
        self.prefix = "_translated_label_"
        self.lang = lang

    def register_label(self, original_label: str) -> str:
        """Register a new label and return the mapped label."""
        if original_label not in self.label_map:
            new_label = f"{self.prefix}_{original_label}_{self.lang}"
            self.label_map[original_label] = new_label
        return self.label_map[original_label]

    def get_all_labels(self) -> Dict[str, str]:
        """Return all label mappings."""
        return self.label_map


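# For example (illustrative label name), with lang="en" a call to
# LabelManager("en").register_label("build_system") returns
# "_translated_label__build_system_en"; the double underscore comes from the
# trailing "_" in the prefix plus the separator in the f-string above.
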
class DocumentTranslator:
    def __init__(self):
        self._cache_lock = threading.Lock()
        self._cache = self._load_cache()
        self.fail_count = 0
        try:
            self.client = openai.OpenAI(
                base_url=CONFIG["base_url"],
                # The API key is deliberately read here instead of being stored in CONFIG,
                # so that carelessly printing CONFIG cannot leak the key.
                api_key=get_env_var("OPENAI_API_KEY", default="ollama"),
            )
        except Exception as e:
            raise RuntimeError(f"Failed to initialize the OpenAI client: {str(e)}")

    def _load_cache(self) -> Dict:
        """Load the translation cache."""
        if os.path.exists(CONFIG["cache_file"]):
            with open(CONFIG["cache_file"], "r", encoding="utf-8") as f:
                try:
                    return json.load(f)
                except json.JSONDecodeError:
                    pass
        return {}

    def _save_cache(self):
        """Save the translation cache."""
        with self._cache_lock:
            with open(CONFIG["cache_file"], "w", encoding="utf-8") as f:
                json.dump(self._cache, f, ensure_ascii=False, indent=2)

    def _get_cache_key(self, filepath: str, lang: str) -> str:
        """Build the cache key (includes the language code)."""
        rel_path = os.path.relpath(filepath, CONFIG["source_dir"])
        return f"{lang}:{rel_path}"
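
    # For example (illustrative path), _get_cache_key("docs/kernel/sched.md", "en")
    # returns "en:kernel/sched.md", so the same source file gets a separate cache
    # entry per target language.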

    def _split_into_chunks(self, text: str) -> List[str]:
        """Split the text into chunks."""
        # Split by paragraph
        paragraphs = re.split(r"\n\s*\n", text)
        chunks = []
        current_chunk = []
        current_size = 0
        for para in paragraphs:
            para_size = len(para.split())
            if current_size + para_size > CONFIG["chunk_size"] and current_chunk:
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_size = 0
            current_chunk.append(para)
            current_size += para_size

        if current_chunk:
            chunks.append("\n\n".join(current_chunk))

        return chunks
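
    # Sketch of the chunking behaviour (word counts are hypothetical): with
    # chunk_size=1000, paragraphs of 600, 600 and 300 whitespace-separated words
    # come out as two chunks, ["p1", "p2\n\np3"]; sizes here are plain word
    # counts, not model tokens.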

    def _process_rst_labels(self, text: str, label_manager: LabelManager) -> str:
        """Process reStructuredText labels."""
        def replace_label(match):
            original_label = match.group(1)
            new_label = label_manager.register_label(original_label)
            return f'.. {new_label}:'

        # Label definitions
        text = re.sub(r'\.\.\s+_([^:]+):', replace_label, text)
        # Label references
        text = re.sub(r'(?<!\w)`([^`]+)`(?!\w)',
                      lambda m: f'`{label_manager.register_label(m.group(1))}`',
                      text)
        return text
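
    # For example (illustrative label), a definition ".. _build:" is rewritten to
    # ".. _translated_label__build_en:" when translating to English.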

    def _process_md_labels(self, text: str, label_manager: LabelManager) -> str:
        """Process Markdown labels."""
        # Explicit label definitions {#label}
        text = re.sub(r'\{#([^}]+)\}',
                      lambda m: f'{{#{label_manager.register_label(m.group(1))}}}',
                      text)

        # Explicit label definitions (label)=
        text = re.sub(r'\(([^)]+)\)=',
                      lambda m: f'({label_manager.register_label(m.group(1))})=',
                      text)

        # Label references [text](#label)
        text = re.sub(r'\[([^\]]+)\]\(#([^)]+)\)',
                      lambda m: f'[{m.group(1)}](#{label_manager.register_label(m.group(2))})',
                      text)

        # Bare label references #label
        text = re.sub(r'(?<!\w)#([\w-]+)(?!\w)',
                      lambda m: f'#{label_manager.register_label(m.group(1))}',
                      text)

        return text
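
    # For example (illustrative label), a MyST definition "(getting_started)=" is
    # rewritten to "(_translated_label__getting_started_en)=" for the English copy.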

    def _generate_unique_label_for_lang(self, text: str, lang: str) -> str:
        # Process labels

        label_manager = LabelManager(lang)
        text = self._process_rst_labels(text, label_manager)
        text = self._process_md_labels(text, label_manager)
        return text

    def _preserve_special_format(self, text: str) -> Tuple[str, Dict]:
        """Preserve special formatting."""
        preserved = {}

        # Drop blocks that must not appear in the translated output
        exclude_blocks = re.findall(
            r'\.\. Note: __EXCLUDE_IN_TRANSLATED_START.*?\.\. Note: __EXCLUDE_IN_TRANSLATED_END',
            text, re.DOTALL)
        for block in exclude_blocks:
            text = text.replace(block, '')

        # Multi-line code blocks
        code_blocks = re.findall(r"```.*?\n.*?```", text, re.DOTALL)
        for i, block in enumerate(code_blocks):
            placeholder = f"__CODE_BLOCK_{i}__"
            preserved[placeholder] = block
            text = text.replace(block, placeholder)
        # Inline code spans
        inline_code = re.findall(r"`[^`]+`", text)
        for i, code in enumerate(inline_code):
            placeholder = f"__INLINE_CODE_{i}__"
            preserved[placeholder] = code
            text = text.replace(code, placeholder)

        return text, preserved
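
    # Sketch of the placeholder scheme (hypothetical snippet): an inline span such
    # as `cargo build` is replaced by "__INLINE_CODE_0__" before translation and is
    # put back afterwards by _restore_special_format().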

    def _restore_special_format(self, text: str, preserved: Dict) -> str:
        """Restore special formatting."""
        # Restore inline code spans first
        for placeholder, content in preserved.items():
            if placeholder.startswith("__INLINE_CODE_"):
                text = text.replace(placeholder, content)

        # Then restore multi-line code blocks
        for placeholder, content in preserved.items():
            if placeholder.startswith("__CODE_BLOCK_"):
                text = text.replace(placeholder, content)

        return text

    def _remove_thinking(self, text: str) -> str:
        """Remove <think> tags from text"""
        return re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
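
    # For instance, "<think>internal reasoning</think>Hello" becomes "Hello", which
    # matters for models that emit a reasoning block before the translated text.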

    def _translate_chunk(self, args: Tuple[str, str]) -> str:
        """Translate a single text chunk (internal method, used for parallel processing)."""
        chunk, lang = args
        retry = 3
        while retry > 0:
            try:
                lang_name = CONFIG["target_languages"].get(lang, "English")
                prompt = f"You are a professional documentation translation assistant. Translate the following Chinese technical documentation into {lang_name} accurately, keeping technical terms correct and the formatting unchanged."

                # disable qwen3's thinking mode
                if "qwen3" in CONFIG["model"].lower():
                    prompt += "\n/no_think\n"
                    chunk += "\n/no_think\n"

                response = self.client.chat.completions.create(
                    extra_body={"enable_thinking": False},
                    model=CONFIG["model"],
                    messages=[
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": chunk}
                    ],
                    temperature=0.3,
                )
                content = response.choices[0].message.content
                return self._remove_thinking(content)
            except Exception as e:
                retry -= 1
                if retry == 0:
                    print(f"Translation failed: {e}, giving up.")
                    return None

                print(f"Translation error: {e}, retrying... ({retry})")
                time.sleep(2)

    def translate_text(self, text: str, lang: str) -> str:
        """Translate text using the OpenAI API
        Args:
            text: the text to translate
            lang: target language code
        """
        chunks = self._split_into_chunks(text)
        translated_chunks = []

        for chunk in chunks:
            translated_chunk = self._translate_chunk((chunk, lang))
            if translated_chunk:
                translated_chunks.append(translated_chunk)

        return "\n\n".join(translated_chunks)

    def process_file(self, filepath: str, lang: str = "en"):
        """Process a single file
        Args:
            filepath: path of the source file
            lang: target language code (default 'en')
        """
        rel_path = os.path.relpath(filepath, CONFIG["source_dir"])
        target_path = os.path.join(
            CONFIG["source_dir"], "locales", lang, rel_path)

        # Skip the file if the translation already exists and the source is unchanged
        cache_key = self._get_cache_key(filepath, lang)
        file_hash = hashlib.md5(open(filepath, "rb").read()).hexdigest()
        target_file_exists = os.path.exists(target_path)
        with self._cache_lock:
            if cache_key in self._cache and self._cache[cache_key]["hash"] == file_hash and target_file_exists:
                print(f"File unchanged, skipping: {rel_path} (language: {lang})")
                return

        print(f"Processing: {rel_path}")

        # Read the file content
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        # Preserve special formatting
        content, preserved = self._preserve_special_format(content)

        content = self._generate_unique_label_for_lang(content, lang)

        # Translate chunk by chunk
        translated_content = self.translate_text(
            content, lang)
        if not translated_content:
            print(f"Translation failed! {filepath}")
            self.fail_count += 1
            return

        # Restore special formatting
        translated_content = self._restore_special_format(
            translated_content, preserved)

        # Create the target directory
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

        # Write the translated result
        with open(target_path, "w", encoding="utf-8") as f:
            # Prepend the translation metadata
            file_ext = os.path.splitext(filepath)[1]
            template = CONFIG["meta_templates"].get(file_ext, "")

            if template:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                original_path = os.path.relpath(filepath, CONFIG["source_dir"])
                meta_content = template.format(
                    note="{note}",
                    model=CONFIG["model"],
                    timestamp=timestamp,
                    original_name=os.path.basename(filepath),
                    original_path=original_path
                )
                translated_content = meta_content + translated_content

            f.write(translated_content)
            f.write("\n")

        # Update the cache
        with self._cache_lock:
            self._cache[cache_key] = {
                "hash": file_hash,
            }

        self._save_cache()

        print(f"File {rel_path} was translated to {lang} and saved to {target_path}")

    def add_language_root_title(self, lang):
        """Add a title to the root document of each language."""
        lang_root_doc_path = os.path.join(
            CONFIG["source_dir"], "locales", lang, "index.rst")

        if not os.path.exists(lang_root_doc_path):
            raise FileNotFoundError(f"Root document for {lang} not found: {lang_root_doc_path}")
        print(f"Adding a title for {CONFIG['target_languages'][lang]}...")

        # Read existing content first
        with open(lang_root_doc_path, "r", encoding="utf-8") as f:
            content = f.read()

        lang_v = CONFIG["target_languages"][lang]
        if content.startswith(lang_v):
            print(f"The title for {lang_v} already exists, skipping...")
            return

        # Then write new content (this clears the file)
        with open(lang_root_doc_path, "w", encoding="utf-8") as f:
            f.write(
                f"{lang_v}\n==========================================\n{content}")

        print(f"Title added to {lang_root_doc_path}")

    def run(self):
        """Run the translation pipeline."""

        print("Collecting all files...")
        all_files = []
        for root, dirs, files in os.walk(CONFIG["source_dir"], topdown=True):
            # Apply the exclusion logic only at the root directory
            if root == CONFIG["source_dir"]:
                dirs[:] = [d for d in dirs if d not in CONFIG["dirs_exclude"]]
            for file in files:
                if file.endswith((".rst", ".md")):
                    all_files.append(os.path.join(root, file))
        total_files = len(all_files)
        print(
            f"Total {total_files} files to translate in {len(CONFIG['target_languages'])} languages.")
        total_tasks = total_files * len(CONFIG["target_languages"])

        # Outer progress bar: languages
        lang_pbar = tqdm(CONFIG["target_languages"].items(),
                         desc="Overall progress",
                         unit="lang",
                         position=0)

        for lang_k, lang_v in lang_pbar:
            lang_pbar.set_description(f"Translating to {lang_v}")

            # Process files in parallel
            from concurrent.futures import ThreadPoolExecutor, as_completed

            # Wrap the per-file handler (convenient for debugging and the like)

            def process_file_wrapper(file_path):
                self.process_file(file_path, lang_k)
                return file_path

            # Create the thread pool
            with ThreadPoolExecutor(max_workers=CONFIG["max_workers"]) as executor:
                # Submit one task per file
                futures = [executor.submit(
                    process_file_wrapper, path) for path in all_files]

                # Create the progress bar
                file_pbar = tqdm(total=len(all_files),
                                 desc=f"Files in {lang_v}",
                                 unit="file",
                                 position=1,
                                 leave=False)

                # Update the progress bar
                for future in as_completed(futures):
                    file_pbar.update(1)
                    future.result()  # re-raises any exception from the worker

                file_pbar.close()
            self.add_language_root_title(lang_k)

        lang_pbar.close()
        print(
            f"\nTranslation finished! Succ: {total_tasks-self.fail_count}, Fail: {self.fail_count}")


if __name__ == "__main__":
    print("Starting translation process...")
    print("WORKERS: ", CONFIG["max_workers"])
    print("LANGUAGES: ", CONFIG["target_languages"])
    print("SOURCE_DIR: ", CONFIG["source_dir"])
    print("MODEL: ", CONFIG["model"])

    translator = DocumentTranslator()
    translator.run()