diff --git a/frameworks/transformers/5.5.0/Dockerfile b/frameworks/transformers/5.5.0/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..44701f4037f15104c81f4cde499cfad89fe98a2d
--- /dev/null
+++ b/frameworks/transformers/5.5.0/Dockerfile
@@ -0,0 +1,20 @@
+
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="stronking 363133710@qq.com"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="Transformers 5.5.0 (GPU) on OpenCloudOS 9"
+
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_DEFAULT_TIMEOUT=120
+
+
+WORKDIR /home
+
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install "transformers[torch] @ git+https://github.com/huggingface/transformers.git@v5.5.0"
+
+# Default command: an interactive Python shell.
+CMD ["python3"]
\ No newline at end of file
diff --git a/frameworks/transformers/5.5.0/README.md b/frameworks/transformers/5.5.0/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..86f19f6ee58d85afc4df1964fddd637d4f757cee
--- /dev/null
+++ b/frameworks/transformers/5.5.0/README.md
@@ -0,0 +1,20 @@
+# Transformers 5.5.0 on OpenCloudOS 9
+
+## Basic information
+- **Framework version**: v5.5.0
+- **Base image**: opencloudos9-cuda-devel:12.8
+- **Python version**: 3.11
+- **CUDA version**: 12.x or later
+
+## Build the image
+
+docker build -t oc9-transformers:5.5.0 .
+
+## Start a container
+
+docker run -dit --gpus all --name oc9-transformers oc9-transformers:5.5.0
+
+## Test the image (run on the host; test.sh drives docker itself)
+
+bash test.sh oc9-transformers:5.5.0
+
diff --git a/frameworks/transformers/5.5.0/build.conf b/frameworks/transformers/5.5.0/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..347fa33f2090cfa71e8b61669fe582c3faf89527
--- /dev/null
+++ b/frameworks/transformers/5.5.0/build.conf
@@ -0,0 +1,4 @@
+# transformers 5.5.0 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-transformers
+IMAGE_TAG=5.5.0
+GPU_TEST=true
\ No newline at end of file
diff --git a/frameworks/transformers/5.5.0/test.sh b/frameworks/transformers/5.5.0/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..14a9b3cca48726a87a47177059c1c50aeccb6abe
--- /dev/null
+++ b/frameworks/transformers/5.5.0/test.sh
@@ -0,0 +1,233 @@
+#!/usr/bin/env bash
+# Run on the host (outside the container): verifies basic transformers / torch / CUDA functionality in the given Docker image.
+# Usage:   bash test_transformers_docker.sh <image> [extra docker run args...]
+# Example: bash test_transformers_docker.sh my-transformers:latest
+# Example: REQUIRE_CUDA=0 bash test_transformers_docker.sh my-transformers:latest
+# Example: DOCKER_NETWORK=bridge bash test_transformers_docker.sh my-transformers:latest
+
+set -Eeuo pipefail
+
+IMAGE="${1:-}"
+if [[ -z "${IMAGE}" || "${IMAGE}" == "-h" || "${IMAGE}" == "--help" ]]; then
+  cat <<'USAGE'
+Usage:
+  bash test_transformers_docker.sh <image> [extra docker run args...]
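+  (the image name must be the first argument; all remaining arguments are passed straight to docker run)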
+
+Environment variables:
+  REQUIRE_CUDA=1|0       whether CUDA/GPU availability is mandatory, default 1
+  GPUS=all               value passed to docker run --gpus, default all
+  PYTHON_BIN=python3     Python command inside the container, default python3
+  DOCKER_NETWORK=none    docker network mode, default none so the test cannot download models
+  TIMEOUT_SECONDS=180    overall test timeout, default 180 seconds
+
+Examples:
+  bash test_transformers_docker.sh registry.example.com/ai/transformers:cu121
+  REQUIRE_CUDA=0 bash test_transformers_docker.sh transformers-cpu:test
+  DOCKER_NETWORK=bridge bash test_transformers_docker.sh my-image:tag --ipc=host
+USAGE
+  exit 1
+fi
+shift || true
+
+REQUIRE_CUDA="${REQUIRE_CUDA:-1}"
+GPUS="${GPUS:-all}"
+PYTHON_BIN="${PYTHON_BIN:-python3}"
+DOCKER_NETWORK="${DOCKER_NETWORK:-none}"
+TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-180}"
+EXTRA_DOCKER_ARGS=("$@")
+
+log() { printf '\033[1;34m%s\033[0m\n' "$*"; }
+ok() { printf '\033[1;32m✓ %s\033[0m\n' "$*"; }
+warn() { printf '\033[1;33m! %s\033[0m\n' "$*"; }
+fail() { printf '\033[1;31m✗ %s\033[0m\n' "$*" >&2; exit 1; }
+
+[[ "${REQUIRE_CUDA}" =~ ^[01]$ ]] || fail "REQUIRE_CUDA must be 1 or 0, got: ${REQUIRE_CUDA}"
+
+command -v docker >/dev/null 2>&1 || fail "docker command not found"
+docker info >/dev/null 2>&1 || fail "docker daemon unavailable; make sure the Docker service is running and the current user has permission to access it"
+
+if ! docker image inspect "${IMAGE}" >/dev/null 2>&1; then
+  warn "image ${IMAGE} not found locally; docker run may try to pull it"
+fi
+
+DOCKER_ARGS=(run --rm -i)
+if [[ "${REQUIRE_CUDA}" == "1" ]]; then
+  DOCKER_ARGS+=(--gpus "${GPUS}")
+fi
+DOCKER_ARGS+=(
+  --network "${DOCKER_NETWORK}"
+  -e "REQUIRE_CUDA=${REQUIRE_CUDA}"
+  -e "PYTHON_BIN=${PYTHON_BIN}"
+  -e "TRANSFORMERS_OFFLINE=1"
+  -e "HF_HUB_OFFLINE=1"
+  -e "HF_HOME=/tmp/hf-home"
+)
+# set -u safe even on bash < 4.4: expand the array only when it is non-empty
+DOCKER_ARGS+=(${EXTRA_DOCKER_ARGS[@]+"${EXTRA_DOCKER_ARGS[@]}"})
+DOCKER_ARGS+=("${IMAGE}" /bin/bash -s)
+
+log "=== Transformers Docker image functional test ==="
+printf 'Image: %s\n' "${IMAGE}"
+printf 'Mandatory CUDA check: %s\n' "${REQUIRE_CUDA}"
+printf 'Docker network: %s\n' "${DOCKER_NETWORK}"
+printf 'Python command: %s\n' "${PYTHON_BIN}"
+if ((${#EXTRA_DOCKER_ARGS[@]} > 0)); then
+  printf 'Extra docker args: %s\n' "${EXTRA_DOCKER_ARGS[*]}"
+fi
+
+RUN_CMD=(docker "${DOCKER_ARGS[@]}")
+if command -v timeout >/dev/null 2>&1; then
+  RUN_CMD=(timeout --preserve-status "${TIMEOUT_SECONDS}s" "${RUN_CMD[@]}")
+fi
+
+"${RUN_CMD[@]}" <<'IN_CONTAINER'
+set -Eeuo pipefail
+
+ok() { printf '\033[1;32m✓ %s\033[0m\n' "$*"; }
+warn() { printf '\033[1;33m! %s\033[0m\n' "$*"; }
+fail() { printf '\033[1;31m✗ %s\033[0m\n' "$*" >&2; exit 1; }
+section() { printf '\n\033[1;34m=== %s ===\033[0m\n' "$*"; }
+
+PY="${PYTHON_BIN:-python3}"
+
+if [[ -d /usr/local/cuda ]]; then
+  export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
+  export PATH="${CUDA_HOME}/bin:${PATH}"
+  export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}"
+fi
+
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_HOME="${HF_HOME:-/tmp/hf-home}"
+mkdir -p "${HF_HOME}"
+
+section "1. Basic Python check"
+command -v "${PY}" >/dev/null 2>&1 || fail "Python command not found in container: ${PY}"
+"${PY}" --version
+ok "Python available"
+
+section "2. Python package import check"
+"${PY}" - <<'PY'
+import importlib
+import platform
+import sys
+
+packages = ["torch", "transformers", "tokenizers"]
+print("python_executable:", sys.executable)
+print("platform:", platform.platform())
+
+for name in packages:
+    mod = importlib.import_module(name)
+    version = getattr(mod, "__version__", "unknown")
+    print(f"{name}: {version}")
+PY
+ok "torch / transformers / tokenizers import correctly"
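+
+# The GPU check below is layered: nvidia-smi confirms the host driver and
+# nvidia-container-toolkit are passed through into the container, and a small
+# torch matmul on the GPU confirms CUDA kernels can actually be launched.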
CUDA / GPU 检查" +if [[ "${REQUIRE_CUDA:-1}" == "1" ]]; then + command -v nvidia-smi >/dev/null 2>&1 || fail "nvidia-smi 不可用;请检查宿主机 NVIDIA 驱动、nvidia-container-toolkit、docker run --gpus 参数" + nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader || fail "nvidia-smi 执行失败" + + "${PY}" - <<'PY' +import torch + +assert torch.cuda.is_available(), "torch.cuda.is_available() 为 False" +print("torch_cuda_version:", torch.version.cuda) +print("gpu_count:", torch.cuda.device_count()) +print("gpu_0_name:", torch.cuda.get_device_name(0)) + +x = torch.randn(512, 512, device="cuda") +y = x @ x +torch.cuda.synchronize() +print("gpu_matmul_shape:", tuple(y.shape)) +PY + ok "CUDA / GPU 可用,且 torch GPU 计算正常" +else + warn "REQUIRE_CUDA=0,跳过强制 CUDA 检查" +fi + +section "4. Transformers 离线功能检查:tokenizer + model forward + save/load + pipeline" +"${PY}" - <<'PY' +import os +import tempfile +import torch +from transformers import ( + AutoModel, + BertConfig, + BertForMaskedLM, + BertTokenizerFast, + pipeline, +) + +require_cuda = os.environ.get("REQUIRE_CUDA", "1") == "1" +device = torch.device("cuda:0" if require_cuda else "cpu") + +with tempfile.TemporaryDirectory() as tmp_dir: + vocab_path = os.path.join(tmp_dir, "vocab.txt") + with open(vocab_path, "w", encoding="utf-8") as f: + f.write("\n".join([ + "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", + "hello", "world", "!", "transformers", "test", "good", + ])) + + tokenizer = BertTokenizerFast(vocab_file=vocab_path, do_lower_case=True) + encoded = tokenizer("hello world !", return_tensors="pt", padding=True, truncation=True) + + config = BertConfig( + vocab_size=tokenizer.vocab_size, + hidden_size=32, + num_hidden_layers=1, + num_attention_heads=4, + intermediate_size=64, + max_position_embeddings=64, + ) + + model = AutoModel.from_config(config).to(device).eval() + encoded_on_device = {k: v.to(device) for k, v in encoded.items()} + with torch.no_grad(): + outputs = model(**encoded_on_device) + + assert outputs.last_hidden_state.shape[0] == 1 + assert outputs.last_hidden_state.shape[-1] == 32 + print("forward_device:", str(device)) + print("tokenizer_vocab_size:", tokenizer.vocab_size) + print("last_hidden_state_shape:", tuple(outputs.last_hidden_state.shape)) + + save_dir = os.path.join(tmp_dir, "tiny-bert") + model.save_pretrained(save_dir) + tokenizer.save_pretrained(save_dir) + + loaded = AutoModel.from_pretrained(save_dir, local_files_only=True).to(device).eval() + with torch.no_grad(): + loaded_outputs = loaded(**encoded_on_device) + + assert loaded_outputs.last_hidden_state.shape == outputs.last_hidden_state.shape + print("local_save_load: ok") + + mlm = BertForMaskedLM(config).to(device).eval() + fill_mask = pipeline( + "fill-mask", + model=mlm, + tokenizer=tokenizer, + device=0 if device.type == "cuda" else -1, + top_k=1, + ) + + result = fill_mask("hello [MASK] !") + assert isinstance(result, list) and len(result) == 1 + assert "token_str" in result[0] + print("pipeline_fill_mask_token:", result[0]["token_str"]) +PY +ok "Transformers 离线核心功能正常" + +section "5. 
+
+section "5. Optional component note"
+if command -v nvcc >/dev/null 2>&1; then
+  nvcc --version | sed -n '1,4p'
+else
+  warn "nvcc not found: runtime images usually do not need nvcc; it is only required when compiling CUDA extensions"
+fi
+
+section "Test result"
+ok "All checks passed"
+IN_CONTAINER
+
+ok "Host-side docker run verification complete"
\ No newline at end of file
diff --git a/frameworks/transformers/5.5.0/test_result.png b/frameworks/transformers/5.5.0/test_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..b34c3c9b6d669deaef4c9c9fb2180fa18c280991
Binary files /dev/null and b/frameworks/transformers/5.5.0/test_result.png differ