diff --git a/scripts/generate_pdf.md b/scripts/generate_pdf.md new file mode 100644 index 0000000..93f6313 --- /dev/null +++ b/scripts/generate_pdf.md @@ -0,0 +1,59 @@ +# Batch Production of Word and PDF Files + +English | [中文](./generate_pdf_zh.md) + +## Conversion to Word + +### Generating Word Files for All Documents (Not Recommended) + +The following command can be used to batch generate a complete Word file that includes all documents. + +```bash +cd /path/to/this/repo +python3 scripts/md2doc-v2.py docs/zh/docs # Generate Word files +``` + +However, the resulting Word file will contain all the content from docs.daocloud.io, making it difficult to read. + +> Currently, this command generates a large number of Word files. Make sure not to commit them to GitHub. + +It is recommended to run the following command before submitting a PR to remove cached Word files: + +```bash +find ./docs -type f -name "*.docx" ! -name "TiDBonHwameiStor.docx" | xargs rm -f +``` + +### Generating Word File for Specific Folder (Recommended) + +```bash +cd /path/to/this/repo +python3 scripts/md2doc-v2.py docs/zh/docs/kpanda # Generate Word files +``` + +> The above is an example to generate documentation for the `kpanda` folder. + +If there are any errors during export, you can try running the following command to install the required dependencies: + +```bash +pip install python-docx +brew install pandoc +``` + +## Conversion to PDF + +### Generating PDF Files for All Documents (Not Supported) + +Currently, generating a full PDF file for all documents fails due to the large size of the complete documentation. + +> Not supported at the moment. + +### Generating PDF File for Specific Folder (Recommended) + +1. Modify the [pdf.yaml](../docs/zh/pdf.yaml) file, only modifying 2 fields: + - `docs_dir` for the folder name to be read + - `output_path` for the location to export the PDF +2. Run the `mkdocs build` command, ensuring that the poetry environment is properly configured + 1. 
Use `poetry install` to install the dependencies + 2. Run `poetry run mkdocs build -f pdf.yaml` to generate the PDF files + +The secondary development of the documentation site is based on an open-source project. Currently, the supported features are limited, but they will be improved in the future. diff --git a/scripts/generate_pdf_zh.md b/scripts/generate_pdf_zh.md new file mode 100644 index 0000000..3529ee7 --- /dev/null +++ b/scripts/generate_pdf_zh.md @@ -0,0 +1,59 @@ +# 批量生产 Word 和 PDF 文件 + +[English](./generate_pdf.md) | 中文 + +## 转换为 Word + +### 生成所有文档的 Word 文件(不推荐) + +下方命令可以批量生成一个全量的 Word 文件,包含所有的文档。 + +```bash +cd /path/to/this/repo +python3 scripts/md2doc-v2.py docs/zh/docs # 生成 Word 文件 +``` + +但这样生成的 Word 文件包含了 docs.daocloud.io 所有的内容,不方便阅读。 + +> 目前此命令会生成大量 Word 文件,注意不要提交到 GitHub 上。 + +建议在提 PR 之前,运行以下命令,移除缓存的 Word 文件: + +```bash +find ./docs -type f -name "*.docx" ! -name "TiDBonHwameiStor.docx" | xargs rm -f +``` + +### 生成具体项目的 Word 文件(推荐) + +```bash +cd /path/to/this/repo +python3 scripts/md2doc-v2.py docs/zh/docs/kpanda # 生成 Word 文件 +``` + +> 以上为例,这里是生成单个 `kpanda` 目录的文档。 + +如果导出时报错,可以尝试运行以下命令安装所需的依赖项: + +```bash +pip install python-docx +brew install pandoc +``` + +## 转换为 PDF + +### 生成所有文档的 PDF 文件(暂不支持) + +目前全部文档过大,所以生成全量文档的 PDF 文件会失败。 + +> 暂不支持 + +### 生成具体项目的 PDF 文件(推荐) + +1. 修改 [pdf.yaml](../docs/zh/pdf.yaml) 文件,只需修改 2 个字段: + - `docs_dir` 需要读取的文件夹名称 + - `output_path` 导出 pdf 的位置 +2. 然后执行 `mkdocs build` 命令,此时需要配置好 poetry 环境 + 1. 使用 `poetry install` 安装依赖项 + 2. 运行 `poetry run mkdocs build -f pdf.yaml` 生成 PDF 文件 + +文档站的二次开发以开源项目为基础,目前支持的功能尚有限,后续会继续完善。 diff --git a/scripts/lsync.sh b/scripts/lsync.sh new file mode 100755 index 0000000..fb6fdc7 --- /dev/null +++ b/scripts/lsync.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# + +# Copyright DaoCloud authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
#!/bin/bash
#

# Copyright DaoCloud authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script checks if the Chinese (source) version of a page has changed
# since a localized page was last committed.
# Instructions: in the repo directory, run `./scripts/lsync.sh docs/en/` to
# check differences between en and zh.
#
# Exit codes: 0 = in sync, 1 = out of sync, 2 = path not found,
#             3 = Chinese source removed.

if [ "$#" -ne 1 ] ; then
    echo -e "\nThis script checks if the Chinese version of a page has changed since a " >&2
    echo -e "localized page has been committed.\n" >&2
    # Show the expected argument (the original printed no placeholder at all).
    echo -e "Usage:\n\t$0 <path-to-file-or-directory>\n" >&2
    echo -e "Example:\n\t$0 docs/en/docs/concepts/_index.md\n" >&2
    exit 1
fi

# Check if path exists, and whether it is a directory or a file
if [ ! -e "$1" ] ; then
    echo "Path not found: '$1'" >&2
    exit 2
fi

if [ -d "$1" ] ; then
    # Directory mode: check every markdown file under the tree.
    SYNCED=1
    for f in $(find "$1" -name "*.md") ; do
        # Map docs/<lang>/... onto the Chinese source path.
        ZH_VERSION=$(echo "$f" | sed "s/docs\/.\{2,5\}\//docs\/zh\//g")
        if [ ! -e "$ZH_VERSION" ]; then
            echo -e "**removed**\t$ZH_VERSION"
            SYNCED=0
            continue
        fi

        # Any change to the zh source since the localized file's last
        # commit means the translation is stale.
        LASTCOMMIT=$(git log -n 1 --pretty=format:%h -- "$f")
        git diff --exit-code --numstat "$LASTCOMMIT...HEAD" "$ZH_VERSION"
        if [ $? -ne 0 ] ; then
            SYNCED=0
        fi
    done
    if [ "$SYNCED" -eq 1 ]; then
        echo "$1 is still in sync"
        exit 0
    fi
    exit 1
fi

# File mode: a single localized page.
LOCALIZED="$1"

# Try get the Chinese version
ZH_VERSION=$(echo "$LOCALIZED" | sed "s/docs\/.\{2,5\}\//docs\/zh\//g")
if [ ! -e "$ZH_VERSION" ]; then
    echo "$ZH_VERSION has been removed."
    exit 3
fi

# Last commit for the localized path
LASTCOMMIT=$(git log -n 1 --pretty=format:%h -- "$LOCALIZED")

git diff --exit-code "$LASTCOMMIT...HEAD" "$ZH_VERSION"

if [ "$?" -eq 0 ]; then
    echo "$LOCALIZED is still in sync"
    exit 0
fi

# Out of sync: exit non-zero (the original fell off the end here and
# implicitly exited 0, reporting a stale file as in sync).
exit 1
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# requirements: pandoc (https://github.com/jgm/pandoc)
# requirements: python-docx docxcompose

import os
import subprocess
import sys


def get_md_files(path, recursion):
    """Collect the paths of all .md files under *path*.

    Args:
        path: Directory to scan; the process exits with status 1 when it
            is not an existing directory.
        recursion: Walk the whole tree when True, otherwise only the top
            level of *path*.

    Returns:
        list[str]: Paths of the Markdown files found. Both branches join
        the directory component, so the results are directly openable
        (the original non-recursive branch returned bare filenames,
        which broke the dirname/chdir logic in main()).
    """
    if not os.path.isdir(path):
        print('请输出正确的路径')
        sys.exit(1)

    if recursion:
        files_name = []
        for root, _dirs, files in os.walk(path):
            for file_name in files:
                if file_name.endswith('.md'):
                    files_name.append(os.path.join(root, file_name))
    else:
        # Non-recursive: current directory only.
        files_name = [os.path.join(path, name)
                      for name in os.listdir(path) if name.endswith('.md')]

    return files_name


def main(path='docs/zh/docs', style='预留', result_path='docs/zh.docx'):
    """Convert each Markdown file under *path* to .docx with pandoc and
    merge all results into a single document at *result_path*.

    Args:
        path: Directory containing the Markdown sources.
        style: Reserved for a future style template (currently unused).
        result_path: Where the merged .docx file is written.
    """
    # Imported lazily so the pure-stdlib helper above stays importable
    # even when python-docx/docxcompose are not installed.
    from docx import Document
    from docxcompose.composer import Composer

    path = os.path.abspath(path)
    result_path = os.path.abspath(result_path)

    files_name = get_md_files(path, recursion=True)
    count_max = len(files_name)
    for index, file_name in enumerate(files_name):
        # pandoc resolves relative image links against the cwd.
        os.chdir(os.path.dirname(file_name))
        print("正在转换", index + 1, "/", count_max, "当前文件:", file_name)
        # Argument list (not a shell string): safe for paths containing
        # spaces. Note os.system never raises on command failure, so the
        # original try/except could not fire; check the exit code instead.
        result = subprocess.run(
            ['pandoc', file_name, '-o', file_name[:-2] + 'docx'])
        if result.returncode != 0:
            print("转换失败, 异常的文件:", file_name)

    docx_list_name = [name[:-2] + 'docx' for name in files_name]

    # The first document supplies the styles of the merged output.
    # NOTE(review): the loop below also appends document 0 again, so its
    # content appears twice; this mirrors the original behavior — confirm
    # whether the merge loop should start at index 1.
    style_demo = Document(docx_list_name[0])
    new_docx = Composer(style_demo)
    count_max = len(docx_list_name)
    for index, word in enumerate(docx_list_name):
        print("正在合并, 已添加", index + 1, "/", count_max, "个 当前读取的文件:", word)
        if not os.path.exists(word):
            print(word, "文件不存在, 请检查md转换记录, 已跳过该文件")
            continue
        word_document = Document(word)
        # Page break between documents but not after the last one (the
        # original compared against count_max - 2, skipping the break
        # before the last document and adding a trailing one instead).
        if index != count_max - 1:
            word_document.add_page_break()
        new_docx.append(word_document)
    new_docx.save(result_path)


if __name__ == '__main__':
    # Fall back to the defaults when no argument is given; the original
    # indexed sys.argv[1] unconditionally and crashed with IndexError.
    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main()
def get_pr_details(pr):
    """Fetch full details for one PR returned by the list endpoint.

    The list endpoint omits diff statistics, so the single-PR endpoint
    (pr["url"]) is queried for changed_files/additions/deletions.

    Args:
        pr: One PR object from GET /repos/{repo}/pulls.

    Returns:
        A dict with the spreadsheet row fields, or None when the request
        fails (callers skip None results).
    """
    pr_url = pr["url"]
    try:
        # A timeout keeps a stalled connection from hanging the worker
        # thread forever (requests has no default timeout).
        pr_response = requests.get(pr_url, headers=headers, timeout=30)
        # Raise on HTTP errors (e.g. rate limiting) so the handler below
        # catches them; the original indexed into the error payload and
        # died with a KeyError instead.
        pr_response.raise_for_status()
        pr_data = pr_response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PR details for {pr_url}: {e}")
        return None

    changed_files = pr_data["changed_files"]
    additions = pr_data["additions"]
    deletions = pr_data["deletions"]

    # Extract the names of all labels
    labels = [label["name"] for label in pr["labels"]]

    # Parse the ISO-8601 creation timestamp (naive; "Z" implies UTC).
    created_at_naive = datetime.strptime(pr["created_at"], "%Y-%m-%dT%H:%M:%SZ")

    return {
        "Date": created_at_naive,
        "Author": pr["user"]["login"],
        "Title": pr["title"],
        "Labels": labels,
        "Label Count": len(labels),
        "Changed Files": changed_files,
        "Additions": additions,
        "Deletions": deletions,
        "PR Link": pr["html_url"]
    }
def fetch_all_prs(url, headers, params, start_date, end_date):
    """Collect details for every PR created within [start_date, end_date].

    Pages through the GitHub pulls API (newest first) and resolves the
    per-PR details concurrently on a thread pool.

    Args:
        url: The /repos/{repo}/pulls endpoint.
        headers: Auth/accept headers for the API.
        params: Base query parameters; "page" is set per iteration.
        start_date, end_date: ISO-8601 bounds (inclusive).

    Returns:
        pd.DataFrame with one row per matching PR.
    """
    columns = ["Date", "Author", "Title", "Labels", "Label Count",
               "Changed Files", "Additions", "Deletions", "PR Link"]
    rows = []
    page = 1
    with ThreadPoolExecutor(max_workers=10) as executor:
        while True:
            params["page"] = page
            response = requests.get(url, headers=headers, params=params)
            data = response.json()
            if not data:
                break
            # ISO-8601 timestamps compare correctly as plain strings.
            futures = [executor.submit(get_pr_details, pr) for pr in data
                       if start_date <= pr["created_at"] <= end_date]
            for future in futures:
                result = future.result()
                if result is not None:
                    rows.append(result)
            page += 1
    # Build the frame once at the end: DataFrame.append was deprecated in
    # pandas 1.4 and removed in 2.0, and per-row appending is O(n^2).
    return pd.DataFrame(rows, columns=columns)

# Fetch all PRs
df = fetch_all_prs(url, headers, params, start_date, end_date)

df["Date"] = pd.to_datetime(df["Date"])
df.set_index("Date", inplace=True)

# Monthly PR details
label_counts_monthly = df["Labels"].explode().value_counts()
monthly_user_counts = df.groupby([df.index.year, df.index.month])['Author'].value_counts()

# Replace the save path as needed
with pd.ExcelWriter('PR_detail_2023.xlsx') as writer:
    df.to_excel(writer, sheet_name='PR Details')
    label_counts_monthly.to_excel(writer, sheet_name='Label Counts')
    monthly_user_counts.to_excel(writer, sheet_name='Monthly User Counts')

print(f"Total PR count for January 2023: {len(df)}")
# Copy all images under the current tree into the matching en/ tree.
def copy_img_file():
    """Mirror every .png/.jpg/.jpeg under the cwd from docs/zh to docs/en,
    creating target directories as needed."""
    import shutil

    for root, dirs, files in os.walk(os.getcwd()):
        for file in files:
            if file.endswith(('.png', '.jpg', '.jpeg')):
                full_file = os.path.join(root, file)
                filepath, filename = os.path.split(full_file)
                # Map the Chinese source tree onto the English tree.
                save_to_path = filepath.replace('/DaoCloud-docs/docs/zh/', '/DaoCloud-docs/docs/en/')
                if not os.path.exists(save_to_path):
                    os.makedirs(save_to_path)
                new_img_file = os.path.join(save_to_path, filename)
                print(new_img_file)
                # shutil.copy2 is safe for paths with spaces or shell
                # metacharacters, unlike the original os.system('cp ...').
                shutil.copy2(full_file, new_img_file)


# Full-translation worker: translate one markdown file zh -> en.
def translate_md(markdown_file):
    """Translate *markdown_file* to English and write it to the mirrored
    path under docs/en, creating directories as needed."""
    filepath, filename = os.path.split(markdown_file)
    new_filepath = filepath.replace('/DaoCloud-docs/docs/zh/', '/DaoCloud-docs/docs/en/')

    if not os.path.exists(new_filepath):
        os.makedirs(new_filepath)

    new_md_file = os.path.join(new_filepath, filename)

    # The docs are UTF-8; pin the encoding instead of relying on the
    # platform default, which breaks on non-UTF-8 locales.
    with open(markdown_file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Translated text
    tran_text = translator.translate(text, dest='en').text

    # Write the translated markdown file
    with open(new_md_file, 'w', encoding='utf-8') as f:
        f.write(tran_text)


# Translate the whole tree.
def full_translate():
    """Translate every .md found by find_md_file(), skipping a known
    problem list, then copy the images over.

    Stops at the first translation error so a flaky proxy does not keep
    burning the API on the remaining files.
    """
    md_file_list = find_md_file()
    for md_file in md_file_list:
        print(md_file)

        # NOTE(review): hard-coded absolute skip-list from the author's
        # machine; on any other checkout these entries match nothing.
        if md_file in ['/Users/samzonglu/Git/daocloud/DaoCloud-docs/docs/zh/docs/dce/terms.md',
                       '/Users/samzonglu/Git/daocloud/DaoCloud-docs/docs/zh/docs/native/open.md',
                       '/Users/samzonglu/Git/daocloud/DaoCloud-docs/docs/zh/docs/native/knowledge.md']:
            continue

        try:
            translate_md(md_file)
        except Exception as e:
            # Fail fast (original behavior) but report what went wrong.
            print(e)
            break

    copy_img_file()


# Translate a single file, writing <name>_translated.md next to it.
def translate_file(file):
    """Translate one markdown file in place (suffixing `_translated`);
    non-markdown files are rejected with a message."""
    if file.endswith('.md'):
        # Derive the output name next to the source file.
        filename, ext = os.path.splitext(file)
        newfile = filename + '_translated' + ext

        # Pin UTF-8 (see translate_md).
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()

        trans_text = translator.translate(text, dest='en').text

        with open(newfile, 'w', encoding='utf-8') as f:
            f.write(trans_text)
    else:
        print("不支持的文件类型,目前仅支持 markdown 文件")
# Translate every .md in *folder*, writing results into a `_translated`
# subdirectory alongside each source directory.
def translate_folder(folder):
    """Walk *folder* and translate each markdown file into a
    `_translated` subdirectory of its parent directory."""
    for root, dirs, files in os.walk(folder):
        new_root = os.path.join(root, '_translated')
        if not os.path.exists(new_root):
            os.makedirs(new_root)

        for file in files:
            if file.endswith('.md'):
                md_file = os.path.join(root, file)

                print(md_file)

                # The docs are UTF-8; pin the encoding rather than rely
                # on the platform default.
                with open(md_file, 'r', encoding='utf-8') as f:
                    text = f.read()

                trans_text = translator.translate(text, dest='en').text

                new_md_file = os.path.join(new_root, file)

                with open(new_md_file, 'w', encoding='utf-8') as f:
                    f.write(trans_text)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='翻译 markdown 文件,命令行使用注意事项:\n')
    parser.add_argument('--file', metavar='file=', type=str, nargs='+',
                        help='需要翻译的文件')
    parser.add_argument('--folder', metavar='folder=', type=str, nargs='+',
                        help='需要翻译的文件文件夹')
    # Keep the value-taking CLI shape for compatibility, but parse the
    # value ourselves: argparse's type=bool treats ANY non-empty string
    # (including "False") as True.
    parser.add_argument('--full_translate', metavar='full_translate', type=str, nargs='+',
                        help='翻译全部文档,这里需要切换到路径 Daocloud-docs/docs/zh/ 下执行')

    args = parser.parse_args()

    if args.file:
        file = args.file[0]
        print(file)
        translate_file(file)
    if args.folder:
        folder = args.folder[0]
        print(folder)
        translate_folder(folder)
    if args.full_translate:
        # Accept "true"/"1"/"yes" (case-insensitive); "False" no longer
        # triggers a full run as it did under type=bool.
        if str(args.full_translate[0]).strip().lower() in ('1', 'true', 'yes'):
            full_translate()

    print(args)
def str_count(s):
    """Count the character categories in *s*.

    Categories (mutually exclusive, tested in order): ASCII letters,
    digits, whitespace, other alphabetic characters (counted as
    Chinese), and everything else (counted as punctuation).

    Returns:
        str: Formatted summary with total/zh/en/space/digit/punct counts.
    """
    count_en = count_dg = count_sp = count_zh = count_pu = 0
    s_len = len(s)
    for c in s:
        if c in string.ascii_letters:
            # English letters
            count_en += 1
        elif c.isdigit():
            # digits
            count_dg += 1
        elif c.isspace():
            # whitespace
            count_sp += 1
        elif c.isalpha():
            # remaining alphabetic characters, counted as Chinese
            count_zh += 1
        else:
            # punctuation / symbols
            count_pu += 1
    # The five categories partition the string, so their sum always
    # equals len(s); the original equality guard (which returned None on
    # a mismatch that can never happen) was dead code and is removed.
    return ('总字数:{0},中文字数:{1},英文字数:{2},空格:{3},数字数:{4},标点符号:{5}'.format(s_len, count_zh, count_en, count_sp, count_dg, count_pu))


class MarkdownCounter:
    """Regex-based character-category counter for one markdown file."""

    def __init__(self, filename):
        # Path of the file to analyze; read lazily by count_words().
        self.filename = filename
        # CJK unified ideographs.
        self.__zh_pattern = u"[\u4e00-\u9fa5]"
        # CJK punctuation plus presentation forms (as in the original).
        self.__zh_punctuation = u"[\u3000-\u303f\ufb00-\ufffd]"
        self.__en_pattern = u"[A-Za-z]"
        self.__digital_pattern = u"[0-9]"
        self.__whitespace = u"[ \t\n\r\f\v]"
        # Anything not matched by the classes above. The original used a
        # zero-width negative lookahead, which matches *positions* (not
        # characters) and therefore always over-counted by one (the
        # end-of-string position); a negated character class counts the
        # actual uncategorized characters.
        self.__others_pattern = u"[^\u4e00-\u9fa5\u3000-\u303f\ufb00-\ufffdA-Za-z0-9 \t\n\r\f\v]"

    def __read_file(self):
        # Read the whole file as UTF-8 text into self.content.
        with io.open(self.filename, mode='r', encoding='utf-8') as md_file:
            self.content = md_file.read()

    def count_words(self):
        """Populate the *_len attributes from the file content.

        Sets zh_len, zh_punc_len, en_len, digital_len, whitespace_len,
        and others_len. (A stray no-op `re.split` expression statement
        from the original was removed here.)
        """
        self.__read_file()
        unicode_content = self.content
        zh_content = re.findall(self.__zh_pattern, unicode_content)
        zh_punc_content = re.findall(self.__zh_punctuation, unicode_content)
        en_content = re.findall(self.__en_pattern, unicode_content)
        dig_content = re.findall(self.__digital_pattern, unicode_content)
        whitespace_content = re.findall(self.__whitespace, unicode_content)
        others_content = re.findall(self.__others_pattern, unicode_content)
        self.zh_len = len(zh_content)
        self.zh_punc_len = len(zh_punc_content)
        self.en_len = len(en_content)
        self.digital_len = len(dig_content)
        self.whitespace_len = len(whitespace_content)
        self.others_len = len(others_content)
+ print("markdown word counter!") + print(os.getcwd()) + + # sample file 'README.md' + # with io.open("README_zh.md", mode='r', encoding='utf-8') as md_file: + # buffer = md_file.read() + # out = str_count(buffer) + # buffer_unicode = buffer.encode('utf-8') + + # counter = MarkdownCounter("README_zh.md") + # counter.count_words() + # print(counter.content.encode('utf-8')) + # print("中文: {}, 中文标点: {}, 英文: {}, 数字: {}, 空格: {}, 其他: {}".format(counter.zh_len, counter.zh_punc_len, counter.en_len, counter.digital_len, counter.whitespace_len, counter.others_len)) + + + # all files + all_files_count_zh = all_files_count_en = 0 + + for root,dirs,files in os.walk(os.getcwd()): + for file in files: + if file.endswith('.md'): + file = root + '/' + file + with io.open(file, mode='r', encoding='utf-8') as md_file: + buffer = md_file.read() + out = str_count(buffer) + buffer_unicode = buffer.encode('utf-8') + + counter = MarkdownCounter(file) + counter.count_words() + all_files_count_zh += counter.zh_len + all_files_count_en += counter.en_len + + print('全部中文字符数:',all_files_count_zh,'全部en字符数:', all_files_count_en) \ No newline at end of file