Prerequisites: install a command-line download tool (huggingface-cli or the newer hf CLI, modelscope, or aistudio) as well as git.
HF_ENDPOINT=https://hf-mirror.com hf download deepseek-ai/DeepSeek-OCR --cache-dir ~/.cache/huggingface/hub
or
HF_ENDPOINT=https://hf-mirror.com huggingface-cli download --resume-download deepseek-ai/DeepSeek-OCR
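The same download can also be scripted with the huggingface_hub Python API; a minimal sketch using snapshot_download, with the same mirror endpoint and cache directory as the commands above:
import os

# Point the Hub client at the mirror before importing huggingface_hub;
# the endpoint is read from the environment when the library is imported.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import snapshot_download

# Download the whole repository into the standard cache directory;
# re-running the call skips files that are already present.
path = snapshot_download(
    repo_id="deepseek-ai/DeepSeek-OCR",
    cache_dir=os.path.expanduser("~/.cache/huggingface/hub"),
)
print(path)  # .../models--deepseek-ai--DeepSeek-OCR/snapshots/<commit>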
Or, via ModelScope (note that the CLI must be installed first):
pip install modelscope
modelscope download --model deepseek-ai/DeepSeek-OCR --local_dir /workspace/model
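The same thing can be done through the ModelScope Python SDK; a minimal sketch, assuming a recent modelscope release in which snapshot_download accepts local_dir:
from modelscope import snapshot_download

# Download the model repository into a plain directory instead of the
# ModelScope cache; the function returns the local path it wrote to.
model_dir = snapshot_download(
    "deepseek-ai/DeepSeek-OCR",
    local_dir="/workspace/model",
)
print(model_dir)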
Or, via AI Studio:
pip install --upgrade aistudio-sdk
aistudio download --model ModelHub/DeepSeek-OCR --local_dir ./
You can also fetch the model with git. Set up Git LFS first:
git lfs install
then clone from one of the mirrors:
git clone https://www.modelscope.cn/deepseek-ai/DeepSeek-OCR.git
or
git clone https://cnb.cool/ai-models/deepseek-ai/DeepSeek-OCR /workspace/model
or
git clone https://git.aistudio.baidu.com/ModelHub/DeepSeek-OCR.git
If GitHub is directly reachable (note that the GitHub repository mainly hosts the inference code; the weights themselves are published on the model hubs above):
git clone https://github.com/deepseek-ai/DeepSeek-OCR.git
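Whichever mirror you clone from, a clone made without Git LFS leaves small pointer files in place of the multi-GB weights. A quick sanity check, assuming the clone sits at /workspace/model:
from pathlib import Path

# A Git LFS pointer is a tiny text file beginning with this header;
# the real weights file is several GB of binary data.
for f in Path("/workspace/model").glob("*.safetensors"):
    with f.open("rb") as fh:
        head = fh.read(64)
    if head.startswith(b"version https://git-lfs"):
        print(f"{f.name}: still an LFS pointer, run `git lfs pull` in the clone")
    else:
        print(f"{f.name}: {f.stat().st_size / 1e9:.2f} GB on disk")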
To make the git-cloned copy usable by transformers, restore the files under /workspace/model into the HuggingFace cache at ~/.cache/huggingface/hub. The expected cache layout is:
HuggingFace cache directory structure
📍 Path: /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-OCR
📁 /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-OCR/
├── blobs/
│ ├── 04e5006bc8b904cca28c1c43286c81b5d00a5208f4b4f0de6357e3ff0e79ab4f (4.2KB)
│ ├── 0ae2fb6d1e5ae8cf100fc32f854830acd08c821a0a1f23a94a76588c222ddcf2 (37.1KB)
│ ├── 0fe7ba9aa6b967a90e4af40d43c8030cbdd3dbcfbb387b8907625d5e54f0dbbe (460B)
│ ├── 1169e7cdc28ff2fb6186556acb2175db148ad26a62097df4c45a17e523180d3f (6.2GB)
│ ├── 2fe88eacc470c34d00225151372d3770948864f3d9cfaae16afa15b2432d7793 (262.5KB)
│ ├── 3a709eb3fdb51cf6d8a546ffb8efe3c80bef61ae0183c2c8476a4c4a41efa3f1 (386.8KB)
│ ├── 5835e0b9e1942fe36df9123009d1d80a2b78ccbe6a2d535668a6561e3d4068b1 (39.2KB)
│ ├── 69184a0493d1fdac21c360f9f9eabc5b5af3188e0265df44cfaeafc44f9095e3 (80.3KB)
│ ├── 6ab21f29a4722e26fa28c8e0d4277591689a598df17cf6c712330e8f62b3fc7c (10.4KB)
│ ├── 81d08f7f33d9d39b95dd9b8162506659e6822d621b9829a208f3830c34c2b4d0 (210.9KB)
│ ├── 887e88e60e5833bc10a2cd7edb89ea7e6992abaae5e1550b027c611b8b8456f2 (114.1KB)
│ ├── a02f8fd5228c90256bb4f6554c34a579d48f909e5beb232dc4afad870b55a8b4 (9.5MB)
│ ├── a0cbe8464049da1f891b7a12676de06af4cb54c130995d42f71adc1c30c6e9f3 (162.0KB)
│ ├── ab4bd57ce17d62e39e0a39e739de1e407484f090f0b2c7e391312bca7a5b061a (801B)
│ ├── ac358c28898de6a71e84e0cb959fcc97fcb9c674edae13cd24286c17b90032d8 (2.6KB)
│ ├── b51d17cc7b282880006a162b06e3a5c4c0a566f3d3093f34a93450ca5a91bd74 (241.0KB)
│ ├── cd24b0cfc7b6c0b1b34bd1aa55bc385e746298fdd82410db6c0d4e0bf69085c0 (241.3KB)
│ ├── ec7b6ce89bcda643de1f43269ffa66a7b2e65dc3ed30e427958f776546b4ba03 (9.0KB)
│ └── f2c6c602815669d292889e5be8c802f2ed950653b77999b1584e8e6aed25d040 (1.1KB)
├── refs/
│ └── main (40B)
└── snapshots/
└── 9f30c71f441d010e5429c532364a86705536c53a/
├── .ipynb_checkpoints/
│ └── README-checkpoint.md → ../../../blobs/04e5006bc8b904cca28c1c43286c81b5d00a5208f4b4f0de6357e3ff0e79ab4f
├── LICENSE → ../../blobs/f2c6c602815669d292889e5be8c802f2ed950653b77999b1584e8e6aed25d040
├── README.md → ../../blobs/04e5006bc8b904cca28c1c43286c81b5d00a5208f4b4f0de6357e3ff0e79ab4f
├── assets/
│ ├── fig1.png → ../../../blobs/3a709eb3fdb51cf6d8a546ffb8efe3c80bef61ae0183c2c8476a4c4a41efa3f1
│ ├── show1.jpg → ../../../blobs/887e88e60e5833bc10a2cd7edb89ea7e6992abaae5e1550b027c611b8b8456f2
│ ├── show2.jpg → ../../../blobs/81d08f7f33d9d39b95dd9b8162506659e6822d621b9829a208f3830c34c2b4d0
│ ├── show3.jpg → ../../../blobs/cd24b0cfc7b6c0b1b34bd1aa55bc385e746298fdd82410db6c0d4e0bf69085c0
│ └── show4.jpg → ../../../blobs/2fe88eacc470c34d00225151372d3770948864f3d9cfaae16afa15b2432d7793
├── config.json → ../../blobs/ac358c28898de6a71e84e0cb959fcc97fcb9c674edae13cd24286c17b90032d8
├── configuration_deepseek_v2.py → ../../blobs/6ab21f29a4722e26fa28c8e0d4277591689a598df17cf6c712330e8f62b3fc7c
├── conversation.py → ../../blobs/ec7b6ce89bcda643de1f43269ffa66a7b2e65dc3ed30e427958f776546b4ba03
├── deepencoder.py → ../../blobs/0ae2fb6d1e5ae8cf100fc32f854830acd08c821a0a1f23a94a76588c222ddcf2
├── model-00001-of-000001.safetensors → ../../blobs/1169e7cdc28ff2fb6186556acb2175db148ad26a62097df4c45a17e523180d3f
├── model.safetensors.index.json → ../../blobs/b51d17cc7b282880006a162b06e3a5c4c0a566f3d3093f34a93450ca5a91bd74
├── modeling_deepseekocr.py → ../../blobs/5835e0b9e1942fe36df9123009d1d80a2b78ccbe6a2d535668a6561e3d4068b1
├── modeling_deepseekv2.py → ../../blobs/69184a0493d1fdac21c360f9f9eabc5b5af3188e0265df44cfaeafc44f9095e3
├── processor_config.json → ../../blobs/0fe7ba9aa6b967a90e4af40d43c8030cbdd3dbcfbb387b8907625d5e54f0dbbe
├── special_tokens_map.json → ../../blobs/ab4bd57ce17d62e39e0a39e739de1e407484f090f0b2c7e391312bca7a5b061a
├── tokenizer.json → ../../blobs/a02f8fd5228c90256bb4f6554c34a579d48f909e5beb232dc4afad870b55a8b4
└── tokenizer_config.json → ../../blobs/a0cbe8464049da1f891b7a12676de06af4cb54c130995d42f71adc1c30c6e9f3
================================================================================
Notes on the directory structure
================================================================================
📁 blobs/ - actual file contents; each file is named by the SHA256 hash of its content
📁 snapshots/ - one directory per commit, containing symlinks (under the original file names) into blobs/
📁 refs/ - ref files; main points to the latest commit
🔹 Total model size: ~6.2GB (the safetensors file)
🔹 All other files are small (KB-scale, apart from the ~9.5MB tokenizer.json)
🔹 Symlinks use relative paths, which makes the cache easy to relocate
💾 Total cache size: 6.23 GB
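To get the same summary programmatically, huggingface_hub's cache scanner can report per-repo sizes; a minimal sketch, assuming the default cache location:
from huggingface_hub import scan_cache_dir

# Walk the local cache and report the size of the DeepSeek-OCR repo;
# the numbers should roughly match the figures listed above.
cache_info = scan_cache_dir()
for repo in cache_info.repos:
    if repo.repo_id == "deepseek-ai/DeepSeek-OCR":
        print(repo.repo_id, f"{repo.size_on_disk / 1e9:.2f} GB", f"{repo.nb_files} files")
The script below rebuilds exactly this layout from a plain git clone: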
#!/usr/bin/env python3
"""
Restore git-cloned model files into the HuggingFace cache layout.
"""
import os
import sys
import hashlib
import shutil
from pathlib import Path


def compute_sha256(file_path):
    """Compute the SHA256 hash of a file."""
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            sha256_hash.update(chunk)
    return sha256_hash.hexdigest()


def restore_to_huggingface_cache(source_dir, target_repo):
    """Restore git-cloned model files into the HuggingFace cache layout.

    Args:
        source_dir: directory of the git clone (e.g. /workspace/model)
        target_repo: HuggingFace cache directory name
            (e.g. models--deepseek-ai--DeepSeek-OCR)
    """
    # Set up paths
    hub_cache = Path("/root/.cache/huggingface/hub")
    target_path = hub_cache / target_repo
    blobs_path = target_path / "blobs"
    snapshots_path = target_path / "snapshots"

    print("=" * 60)
    print("Restoring Git Cloned Model to HuggingFace Cache")
    print("=" * 60)
    print(f"\nSource: {source_dir}")
    print(f"Target: {target_path}")
    print("=" * 60)

    # Create the cache directories
    blobs_path.mkdir(parents=True, exist_ok=True)
    snapshots_path.mkdir(parents=True, exist_ok=True)

    # Create refs/main (it holds the commit hash the snapshot belongs to)
    refs_dir = target_path / "refs"
    refs_dir.mkdir(exist_ok=True)

    # Use a fixed commit hash as the snapshot directory name
    commit_hash = "9f30c71f441d010e5429c532364a86705536c53a"
    (refs_dir / "main").write_text(commit_hash)

    # Create the snapshot directory
    snapshot_dir = snapshots_path / commit_hash
    snapshot_dir.mkdir(exist_ok=True)

    print("\nProcessing files...")
    print("-" * 60)

    source_path = Path(source_dir)
    processed_files = []

    # Process every file except the .git directory
    for file_path in source_path.rglob("*"):
        if not file_path.is_file():
            continue
        if ".git" in str(file_path):
            continue

        # Path relative to the clone root
        rel_path = file_path.relative_to(source_path)

        # Hash the file content
        print(f"Processing: {rel_path}")
        sha256 = compute_sha256(file_path)

        # Copy into blobs/ using the SHA256 as the file name
        blob_file = blobs_path / sha256
        if not blob_file.exists():
            shutil.copy2(file_path, blob_file)
            print(f"  → Blob saved: {blob_file.name}")

        # Create a relative symlink inside the snapshot directory
        snapshot_file = snapshot_dir / rel_path
        snapshot_file.parent.mkdir(parents=True, exist_ok=True)
        rel_to_snapshot = os.path.relpath(blob_file, snapshot_file.parent)

        # Remove any existing entry (is_symlink also catches dangling links)
        if snapshot_file.is_symlink() or snapshot_file.exists():
            snapshot_file.unlink()

        os.symlink(rel_to_snapshot, snapshot_file)
        print(f"  → Symlink created: {snapshot_file}")
        processed_files.append(str(rel_path))

    print("\n" + "=" * 60)
    print("✅ Restoration completed successfully!")
    print("=" * 60)
    print(f"\nTotal files processed: {len(processed_files)}")
    print("\nYou can now use:")
    print("  from transformers import AutoModel")
    print('  model = AutoModel.from_pretrained("deepseek-ai/DeepSeek-OCR")')
    print("=" * 60)
    return target_path


if __name__ == "__main__":
    SOURCE_DIR = "/workspace/model"                     # git clone directory
    TARGET_REPO = "models--deepseek-ai--DeepSeek-OCR"   # HF cache directory name
    try:
        result_path = restore_to_huggingface_cache(SOURCE_DIR, TARGET_REPO)
        print(f"\n✅ Files restored to: {result_path}")
    except KeyboardInterrupt:
        print("\n\n⚠️ Process interrupted by user")
        sys.exit(130)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
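Once the script has run, a quick check that the restored cache is actually picked up (offline mode keeps transformers from re-downloading; trust_remote_code is needed because the repo ships custom modeling files such as modeling_deepseekocr.py). A hedged sketch, assuming the restore completed:
import os

# Force the Hub client to resolve everything from the local cache only.
os.environ["HF_HUB_OFFLINE"] = "1"

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-OCR", trust_remote_code=True)
model = AutoModel.from_pretrained("deepseek-ai/DeepSeek-OCR", trust_remote_code=True)
print(type(model).__name__)  # the custom class shipped in modeling_deepseekocr.py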
References
https://huggingface.co/deepseek-ai/DeepSeek-OCR
https://hf-mirror.com/
https://github.com/deepseek-ai/DeepSeek-OCR
https://aistudio.baidu.com/modelsdetail/38538/space
https://www.modelscope.cn/models/deepseek-ai/DeepSeek-OCR
https://cnb.cool/ai-models/deepseek-ai/DeepSeek-OCR