Merge pull request d-run#70 from windsonsea/script

add Scripts
windsonsea · Apr 19, 2024 · 416416b · 416416b
2 parents 52d0fb6 + c399265
commit 416416b
Show file tree

Hide file tree

Showing 7 changed files with 621 additions and 0 deletions.
diff --git a/scripts/generate_pdf.md b/scripts/generate_pdf.md
@@ -0,0 +1,59 @@
+# Batch Generation of Word and PDF Files
+
+English | [中文](./generate_pdf_zh.md)
+
+## Conversion to Word
+
+### Generating Word Files for All Documents (Not Recommended)
+
+The following command can be used to batch generate a complete Word file that includes all documents.
+
+```bash
+cd /path/to/this/repo
+python3 scripts/md2doc-v2.py docs/zh/docs # Generate Word files
+```
+
+However, the resulting Word file will contain all the content from docs.daocloud.io, making it difficult to read.
+
+> Currently, this command generates a large number of Word files. Make sure not to commit them to GitHub.
+
+It is recommended to run the following command before submitting a PR to remove cached Word files:
+
+```bash
+find ./docs -type f -name "*.docx" ! -name "TiDBonHwameiStor.docx" | xargs rm -f
+```
+
+### Generating Word File for Specific Folder (Recommended)
+
+```bash
+cd /path/to/this/repo
+python3 scripts/md2doc-v2.py docs/zh/docs/kpanda # Generate Word files
+```
+
+> The example above generates the Word document for the `kpanda` folder only.
+
+If there are any errors during export, you can try running the following command to install the required dependencies:
+
+```bash
+pip install python-docx docxcompose
+brew install pandoc
+```
+
+## Conversion to PDF
+
+### Generating PDF Files for All Documents (Not Supported)
+
+Currently, generating a full PDF file for all documents fails due to the large size of the complete documentation.
+
+> Not supported at the moment.
+
+### Generating PDF File for Specific Folder (Recommended)
+
+1. Edit the [pdf.yaml](../docs/zh/pdf.yaml) file; only 2 fields need to be changed:
+   - `docs_dir` for the folder name to be read
+   - `output_path` for the location to export the PDF
+2. Run the `mkdocs build` command, ensuring that the poetry environment is properly configured
+   1. Use `poetry install` to install the dependencies
+   2. Run `poetry run mkdocs build -f pdf.yaml` to generate the PDF files
+
+The documentation site is a customized build on top of an open-source project. Feature support is currently limited and will be improved over time.
diff --git a/scripts/generate_pdf_zh.md b/scripts/generate_pdf_zh.md
@@ -0,0 +1,59 @@
+# 批量生成 Word 和 PDF 文件
+
+[English](./generate_pdf.md) | 中文
+
+## 转换为 Word
+
+### 生成所有文档的 Word 文件（不推荐）
+
+下方命令可以批量生成一个全量的 Word 文件，包含所有的文档。
+
+```bash
+cd /path/to/this/repo
+python3 scripts/md2doc-v2.py docs/zh/docs # 生成 Word 文件
+```
+
+但这样生成的 Word 文件包含了 docs.daocloud.io 所有的内容，不方便阅读。
+
+> 目前此命令会生成大量 Word 文件，注意不要提交到 GitHub 上。
+
+建议在提 PR 之前，运行以下命令，移除缓存的 Word 文件：
+
+```bash
+find ./docs -type f -name "*.docx" ! -name "TiDBonHwameiStor.docx" | xargs rm -f
+```
+
+### 生成具体项目的 Word 文件（推荐）
+
+```bash
+cd /path/to/this/repo
+python3 scripts/md2doc-v2.py docs/zh/docs/kpanda # 生成 Word 文件
+```
+
+> 以上示例仅生成 `kpanda` 目录的文档。
+
+如果导出时报错，可以尝试运行以下命令安装所需的依赖项：
+
+```bash
+pip install python-docx docxcompose
+brew install pandoc
+```
+
+## 转换为 PDF
+
+### 生成所有文档的 PDF 文件（暂不支持）
+
+目前全部文档过大，所以生成全量文档的 PDF 文件会失败。
+
+> 暂不支持
+
+### 生成具体项目的 PDF 文件（推荐）
+
+1. 修改 [pdf.yaml](../docs/zh/pdf.yaml) 文件，只需修改 2 个字段：
+   - `docs_dir` 需要读取的文件夹名称
+   - `output_path` 导出 pdf 的位置
+2. 然后执行 `mkdocs build` 命令，此时需要配置好 poetry 环境
+   1. 使用 `poetry install` 安装依赖项
+   2. 运行 `poetry run mkdocs build -f pdf.yaml` 生成 PDF 文件
+
+文档站的二次开发以开源项目为基础，目前支持的功能尚有限，后续会继续完善。
diff --git a/scripts/lsync.sh b/scripts/lsync.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+#
+
+# Copyright DaoCloud authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script checks if the Chinese version of a page has changed since the
+# corresponding localized page was last committed.
+# Instructions: in the repo directory, run `./scripts/lsync.sh docs/en/` to check
+# differences between en and zh.
+
+# Exit codes: 0 = in sync, 1 = usage error or out of sync,
+#             2 = path not found, 3 = Chinese version of a single file is missing.
+
+# Print usage to stderr and abort unless exactly one path argument was given.
+if [ "$#" -ne 1 ] ; then
+  echo -e "\nThis script checks if the Chinese version of a page has changed since a " >&2
+  echo -e "localized page has been committed.\n" >&2
+  echo -e "Usage:\n\t$0 <PATH>\n" >&2
+  echo -e "Example:\n\t$0 docs/en/docs/concepts/_index.md\n" >&2
+  exit 1
+fi
+
+# Check if path exists, and whether it is a directory or a file
+if [ ! -e "$1" ] ; then
+  echo "Path not found: '$1'" >&2
+  exit 2
+fi
+
+# Map a localized path (docs/<lang>/...) to its Chinese counterpart (docs/zh/...).
+zh_path() {
+  printf '%s\n' "$1" | sed "s/docs\/.\{2,5\}\//docs\/zh\//g"
+}
+
+if [ -d "$1" ] ; then
+  SYNCED=1
+  # NUL-delimited find/read keeps file names containing spaces intact; the
+  # process substitution keeps the loop in this shell so SYNCED survives it.
+  while IFS= read -r -d '' f ; do
+    ZH_VERSION=$(zh_path "$f")
+    if [ ! -e "$ZH_VERSION" ]; then
+      echo -e "**removed**\t$ZH_VERSION"
+      SYNCED=0
+      continue
+    fi
+
+    LASTCOMMIT=$(git log -n 1 --pretty=format:%h -- "$f")
+    if [ -z "$LASTCOMMIT" ]; then
+      # Without a commit for the localized page there is nothing to compare
+      # against, so it cannot be reported as in sync.
+      echo -e "**untracked**\t$f" >&2
+      SYNCED=0
+      continue
+    fi
+
+    if ! git diff --exit-code --numstat "$LASTCOMMIT...HEAD" -- "$ZH_VERSION" ; then
+      SYNCED=0
+    fi
+  done < <(find "$1" -name "*.md" -print0)
+  if [ "$SYNCED" -eq 1 ]; then
+    echo "$1 is still in sync"
+    exit 0
+  fi
+  exit 1
+fi
+
+LOCALIZED="$1"
+
+# Try get the Chinese version
+ZH_VERSION=$(zh_path "$LOCALIZED")
+if [ ! -e "$ZH_VERSION" ]; then
+  echo "$ZH_VERSION has been removed."
+  exit 3
+fi
+
+# Last commit for the localized path
+LASTCOMMIT=$(git log -n 1 --pretty=format:%h -- "$LOCALIZED")
+if [ -z "$LASTCOMMIT" ]; then
+  echo "$LOCALIZED has no commit history; cannot determine sync state." >&2
+  exit 1
+fi
+
+if git diff --exit-code "$LASTCOMMIT...HEAD" -- "$ZH_VERSION" ; then
+  echo "$LOCALIZED is still in sync"
+  exit 0
+fi
+
+# Out of sync: the diff was printed above; report it through the exit status
+# as the directory branch does.
+exit 1
diff --git a/scripts/md2doc-v2.py b/scripts/md2doc-v2.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+# requirements: pandoc (https://github.com/jgm/pandoc)
+# requirements: python-docx docxcompose
+
+import os, sys
+from docx import Document
+from docxcompose.composer import Composer
+
+
+def get_md_files(path, recursion):
+    """Return the paths of the ``.md`` files found under ``path``.
+
+    Args:
+        path: Directory to search.
+        recursion: If true, walk every sub-directory of ``path``; otherwise
+            only look at the files directly inside ``path``.
+
+    Returns:
+        A list of file paths. In both modes each entry is joined with the
+        directory it was found in, so callers can open it directly.
+
+    Raises:
+        SystemExit: with status 1 when ``path`` is not a directory.
+    """
+    if not os.path.isdir(path):
+        print('请输入正确的路径', file=sys.stderr)
+        sys.exit(1)
+
+    if recursion:
+        return [
+            os.path.join(root, file_name)
+            for root, _dirs, files in os.walk(path)
+            for file_name in files
+            if file_name.endswith('.md')
+        ]
+
+    # Non-recursive: current directory only. Entries are joined with ``path``
+    # and restricted to regular files so the result has the same shape as the
+    # recursive branch.
+    return [
+        os.path.join(path, name)
+        for name in os.listdir(path)
+        if name.endswith('.md') and os.path.isfile(os.path.join(path, name))
+    ]
+
+
+def main(path='docs/zh/docs', style='预留', result_path='docs/zh.docx'):
+    """Convert every .md file under ``path`` to .docx and merge them into one file.
+
+    Each Markdown file is converted by running ``pandoc`` in the file's own
+    directory, producing a ``.docx`` next to it. The converted files that
+    exist are then merged, separated by page breaks, and saved to
+    ``result_path``.
+
+    Args:
+        path: Root directory that is searched recursively for ``.md`` files.
+        style: Reserved; currently unused.
+        result_path: Where the merged Word file is written.
+    """
+    # Function-scope import: only this function needs subprocess.
+    import subprocess
+
+    path = os.path.abspath(path)
+    result_path = os.path.abspath(result_path)
+
+    files_name = get_md_files(path, recursion=True)
+    count_max = len(files_name)
+    docx_list_name = []
+    for index, file_name in enumerate(files_name):
+        docx_name = os.path.splitext(file_name)[0] + '.docx'
+        docx_list_name.append(docx_name)
+        print("正在转换", index + 1, "/", count_max, "当前文件:", file_name)
+        try:
+            # An argument list (no shell) keeps paths with spaces or shell
+            # metacharacters intact; cwd replaces the former os.chdir so the
+            # process-wide working directory is left untouched.
+            subprocess.run(
+                ['pandoc', file_name, '-o', docx_name],
+                cwd=os.path.dirname(file_name),
+                check=True,
+            )
+        except (OSError, subprocess.CalledProcessError) as err:
+            print("转换失败, 异常的文件:", file_name, err)
+
+    # Only merge the files that were actually produced.
+    existing = []
+    for word in docx_list_name:
+        if os.path.exists(word):
+            existing.append(word)
+        else:
+            print(word, "文件不存在, 请检查md转换记录, 已跳过该文件")
+
+    if not existing:
+        print("没有可合并的 docx 文件", file=sys.stderr)
+        return
+
+    count_max = len(existing)
+    new_docx = None
+    for index, word in enumerate(existing):
+        print("正在合并, 已添加", index + 1, "/", count_max, "个 当前读取的文件:", word)
+        word_document = Document(word)
+        # Page break between documents, but not after the last one.
+        if index != count_max - 1:
+            word_document.add_page_break()
+        if new_docx is None:
+            # The first document is the base of the merged file; it is not
+            # appended again, so it appears only once in the result.
+            new_docx = Composer(word_document)
+        else:
+            new_docx.append(word_document)
+
+    # Save once after all documents are merged instead of on every iteration.
+    os.makedirs(os.path.dirname(result_path), exist_ok=True)
+    new_docx.save(result_path)
+
+
+
+if __name__ == '__main__':
+    # Forward the optional first CLI argument; with none given, main() falls
+    # back to its default path instead of raising IndexError on sys.argv[1].
+    main(*sys.argv[1:2])
diff --git a/scripts/pr-report.py b/scripts/pr-report.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+#
+# Created on Tue Jan 16 15:17:40 2024
+# @author: FanLin
+#
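+# This script exports the repo's PR records (within the configured date range)
+# so they can be summarized.
+#
+# Requires the requests and pandas libraries, plus an Excel writer engine
+# such as openpyxl for the .xlsx output:
+# pip install requests
+# pip install pandas
+# pip install openpyxl
+#
+# By default the report is written to the current working directory
+# (the repo root when the script is run from there)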
+
+import os
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+
+import pandas as pd
+import requests
+
+# GitHub token: taken from the GITHUB_TOKEN environment variable when it is set,
+# so the secret does not have to be written into this tracked file; otherwise
+# replace the placeholder below with your token.
+token = os.environ.get("GITHUB_TOKEN", "替换为你的 token")
+# Repository to query
+repo = "DaoCloud/DaoCloud-docs"
+# Date range to export (compared against each PR's created_at timestamp)
+start_date = "2023-01-01T00:00:00Z"
+end_date = "2023-01-31T23:59:59Z"
+# GitHub API endpoint listing the repository's pull requests
+url = f"https://api.github.com/repos/{repo}/pulls"
+
+headers = {
+    "Authorization": f"token {token}",
+    "Accept": "application/vnd.github.v3+json",
+}
+
+params = {
+    "state": "all",  # fetch PRs in every state
+    "sort": "created",
+    "direction": "desc",  # newest first
+    "per_page": 100,  # results per page
+}
+
+def get_pr_details(pr):
+    """Fetch the detail record of one pull request and flatten it into a row.
+
+    Args:
+        pr: One item of the pull-request list response. Its ``url``,
+            ``labels``, ``created_at``, ``user``, ``title`` and ``html_url``
+            fields are read.
+
+    Returns:
+        A dict with the keys ``Date``, ``Author``, ``Title``, ``Labels``,
+        ``Label Count``, ``Changed Files``, ``Additions``, ``Deletions`` and
+        ``PR Link``, or ``None`` when the detail request fails.
+
+    Uses the module-level ``headers`` for the request.
+    """
+    pr_url = pr["url"]
+    try:
+        pr_response = requests.get(pr_url, headers=headers, timeout=30)
+        # Turn HTTP errors (bad token, rate limit, ...) into a handled failure
+        # here rather than a KeyError on the error payload below.
+        pr_response.raise_for_status()
+        pr_data = pr_response.json()
+    except (requests.exceptions.RequestException, ValueError) as e:
+        print(f"Error fetching PR details for {pr_url}: {e}")
+        return None
+
+    changed_files = pr_data["changed_files"]
+    additions = pr_data["additions"]
+    deletions = pr_data["deletions"]
+
+    # Extract the names of all labels
+    labels = [label["name"] for label in pr["labels"]]
+
+    # Parsed without timezone info (the trailing "Z" is matched literally).
+    created_at_naive = datetime.strptime(pr["created_at"], "%Y-%m-%dT%H:%M:%SZ")
+
+    return {
+        "Date": created_at_naive,
+        "Author": pr["user"]["login"],
+        "Title": pr["title"],
+        "Labels": labels,
+        "Label Count": len(labels),
+        "Changed Files": changed_files,
+        "Additions": additions,
+        "Deletions": deletions,
+        "PR Link": pr["html_url"]
+    }
+
+def fetch_all_prs(url, headers, params, start_date, end_date):
+    """Page through the PR list and collect details for PRs in the date range.
+
+    Args:
+        url: Pull-request list endpoint.
+        headers: HTTP headers sent with every list request.
+        params: Query parameters for the list request; not modified.
+        start_date: Inclusive lower bound, compared as a string against each
+            PR's ``created_at`` value.
+        end_date: Inclusive upper bound, compared the same way.
+
+    Returns:
+        A DataFrame with one row per PR whose details could be fetched.
+
+    Raises:
+        requests.exceptions.HTTPError: when a list request returns an error
+            status.
+    """
+    columns = ["Date", "Author", "Title", "Labels", "Label Count", "Changed Files", "Additions", "Deletions", "PR Link"]
+    rows = []
+    # Work on a copy so the caller's dict does not gain a "page" key.
+    query = dict(params)
+    # Stopping early is only valid when the list is ordered newest-first by
+    # creation time; otherwise every page has to be read.
+    newest_first = query.get("sort") == "created" and query.get("direction") == "desc"
+    page = 1
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        while True:
+            query["page"] = page
+            response = requests.get(url, headers=headers, params=query, timeout=30)
+            # An error payload is a dict, not a list of PRs; fail loudly
+            # instead of iterating over its keys.
+            response.raise_for_status()
+            data = response.json()
+            if not data:
+                break
+            futures = [
+                executor.submit(get_pr_details, pr)
+                for pr in data
+                if start_date <= pr["created_at"] <= end_date
+            ]
+            for future in futures:
+                result = future.result()
+                if result is not None:
+                    rows.append(result)
+            # Every later page is older still, so nothing in range remains.
+            if newest_first and data[-1]["created_at"] < start_date:
+                break
+            page += 1
+    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
+    return pd.DataFrame(rows, columns=columns)
+
+# Fetch every PR created inside the configured date range
+df = fetch_all_prs(url, headers, params, start_date, end_date)
+
+df["Date"] = pd.to_datetime(df["Date"])
+df.set_index("Date", inplace=True)
+
+# Label totals across the export, and PR counts per author grouped by year and month
+label_counts_monthly = df["Labels"].explode().value_counts()
+monthly_user_counts = df.groupby([df.index.year, df.index.month])['Author'].value_counts()
+
+# Change the output path here if needed
+with pd.ExcelWriter('PR_detail_2023.xlsx') as writer:
+    df.to_excel(writer, sheet_name='PR Details')
+    label_counts_monthly.to_excel(writer, sheet_name='Label Counts')
+    monthly_user_counts.to_excel(writer, sheet_name='Monthly User Counts')
+
+# Report the range actually queried rather than a fixed month name.
+print(f"Total PR count from {start_date} to {end_date}: {len(df)}")