HF 模型与语料下载姿势 — huggingface-cli + Supervisor
huggingface-cli 原生命令
挂载远程目录
# Mount the NAS export over NFS.
# Create the mount point first: mount fails if the target directory is missing.
mkdir -p /mnt/nas
# Go through mount(8) rather than calling the mount.nfs helper directly.
mount -t nfs 172.17.120.251:/volume1/Public /mnt/nas
安装环境依赖
# Compiler toolchain and headers needed to build CPython from source.
sudo yum groupinstall "Development Tools" -y
sudo yum install openssl-devel libffi-devel bzip2-devel -y
# Fetch, unpack and build Python 3.9.16.
wget https://www.python.org/ftp/python/3.9.16/Python-3.9.16.tgz
tar xf Python-3.9.16.tgz
cd Python-3.9.16
./configure --enable-optimizations
# altinstall adds versioned python3.9 / pip3.9 binaries and leaves the
# system "python" untouched.
sudo make altinstall
# Pin huggingface-hub: per the original note, resumable downloads as used in
# this guide stopped working from 0.23 on, so a fixed older version is installed.
# Use pip3.9 so the packages land in the interpreter built above (plain "pip"
# would target the system Python instead).
pip3.9 install -U huggingface-hub==0.22.2
# NOTE(review): urllib3 is pinned to 1.x, presumably because urllib3 2.x needs a
# newer OpenSSL than this host provides - confirm before changing.
pip3.9 install urllib3==1.26.6
huggingface-cli下载命令
# Send Hub requests to the hf-mirror.com endpoint instead of the default one.
export HF_ENDPOINT="https://hf-mirror.com"
# Download the bigcode/the-stack-dedup dataset repository into
# ./bigcode_the-stack-dedup, keeping the blob cache under ./.cache/huggingface/.
huggingface-cli download bigcode/the-stack-dedup \
  --repo-type dataset \
  --token hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxx \
  --cache-dir ./.cache/huggingface/ \
  --local-dir bigcode_the-stack-dedup \
  --local-dir-use-symlinks True \
  --resume-download
# Note: --local-dir-use-symlinks controls whether files in the local dir are
# symlinks into the cache; per the original note, resuming an interrupted
# download does not work when it is set to False.
supervisor进程守护
安装supervisor
# Install supervisor into the Python 3.9 environment.
pip3.9 install supervisor
# Create the config directory together with conf.d: the generated main config
# is extended with an [include] of /etc/supervisor/conf.d/*.conf, and the
# per-program config is saved there. -p also makes this safe to re-run.
mkdir -p /etc/supervisor/conf.d
# Write supervisor's sample configuration as the starting point.
echo_supervisord_conf > /etc/supervisor/supervisord.conf
supervisor配置文件
vim /etc/supervisor/supervisord.conf
; Main supervisord configuration: local control socket, web console,
; daemon settings, and inclusion of per-program files from conf.d.
[unix_http_server]
file=/tmp/supervisor.sock ; the path to the socket file
[inet_http_server] ; inet (TCP) server disabled by default
port=0.0.0.0:9001 ; ip_address:port specifier, *:port for all iface
; The console listens on every interface and supervisord runs as root, so
; require a login. Replace both placeholder values before starting supervisord.
username=admin ; user name required by the web console
password=CHANGE_ME ; password required by the web console
[supervisord]
logfile=/tmp/supervisord.log ; main log file; default $CWD/supervisord.log
logfile_maxbytes=50MB ; max main logfile bytes b4 rotation; default 50MB
logfile_backups=10 ; # of main logfile backups; 0 means none, default 10
loglevel=info ; log level; default info; others: debug,warn,trace
pidfile=/tmp/supervisord.pid ; supervisord pidfile; default supervisord.pid
nodaemon=false ; start in foreground if true; default false
user=root ; account supervisord itself runs as
silent=false ; no logs to stdout if true; default false
minfds=1024 ; min. avail startup file descriptors; default 1024
minprocs=200 ; min. avail process descriptors;default 200
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
[supervisorctl]
serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket
[include]
files = /etc/supervisor/conf.d/*.conf
huggingface实例配置
vim /etc/supervisor/conf.d/huggingface.conf
; Supervisor program definition that keeps the huggingface-cli download
; wrapper script running.
[program:huggingface_cli]
command=sh /etc/supervisor/huggingface_cli.sh
priority=999 ; the relative start priority (default 999)
autostart=true ; start at supervisord start (default: true)
; "true" restarts the script after every exit, including a clean one once the
; download has finished; switch to "unexpected" (the default) to stop then.
autorestart=true ; restart whenever the process exits (default: unexpected)
startsecs=10 ; number of secs prog must stay running (def. 10)
startretries=3 ; max # of serial start failures (default 3)
exitcodes=0,2 ; exit codes treated as 'expected'
stopsignal=TERM ; signal used to kill process (default TERM)
stopwaitsecs=10 ; max num secs to wait before SIGKILL (default 10)
; The command is a shell wrapper, so signal the whole process group; otherwise
; stopping the program could leave the downloader itself running.
stopasgroup=true ; send stop signal to the UNIX process group (default false)
killasgroup=true ; SIGKILL the UNIX process group (default false)
user=root ; setuid to this UNIX account to run the program
; stdout/stderr logging. The option names below are the ones documented for
; [program:x]; log_stdout, log_stderr, logfile, logfile_maxbytes and
; logfile_backups are not program options.
redirect_stderr=false ; keep stderr in its own log file (default false)
stdout_logfile=/var/log/huggingface_cli_log.log ; stdout log path
stdout_logfile_maxbytes=1MB ; max # stdout logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=10 ; # of stdout logfile backups (default 10)
stderr_logfile=/var/log/huggingface_cli.log ; stderr log path
stderr_logfile_maxbytes=2MB ; max # stderr logfile bytes b4 rotation (default 50MB)
stderr_logfile_backups=20 ; # of stderr logfile backups (default 10)
供supervisor调用的下载脚本
vim /etc/supervisor/huggingface_cli.sh
#!/bin/bash
#
# Download one Hugging Face repository with huggingface-cli.
# Intended to be launched (and relaunched) by supervisor.
#
# Environment:
#   HF_TOKEN  optional; when set it replaces the placeholder token below.

# Abort on the first failing command or on use of an unset variable.
set -eu

# Send Hub requests to the hf-mirror.com endpoint.
export HF_ENDPOINT="https://hf-mirror.com"

TOKEN="${HF_TOKEN:-hf_xxxxxxxxxxxxxxxxxxxxx}"
REPO_TYPE="dataset"
REPO_NAME="liwu/MNBVC"
LOCAL_DIR="/mnt/nas/Yuliao/Downloaded-EN/MNBVC/"
CACHE_DIR="/mnt/nas/Yuliao/Downloaded-EN/.cache/huggingface"

# exec replaces this shell with the downloader, so signals sent by supervisor
# reach huggingface-cli directly and its exit status is reported unchanged.
# Every expansion is quoted so values containing spaces stay single arguments,
# and the final option carries no trailing backslash (a dangling continuation
# would swallow whatever line follows it).
exec huggingface-cli download \
  "$REPO_NAME" \
  --token "$TOKEN" \
  --resume-download \
  --local-dir-use-symlinks True \
  --repo-type "$REPO_TYPE" \
  --local-dir "$LOCAL_DIR" \
  --cache-dir "$CACHE_DIR"
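web控制台
先执行 supervisord -c /etc/supervisor/supervisord.conf 启动守护进程,然后在浏览器访问 http://<服务器IP>:9001(即 supervisord.conf 中 [inet_http_server] 配置的端口),即可查看 huggingface_cli 进程状态并进行启停。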
附:常规wget下载
# Resume partial downloads (-c) and recursively fetch every file served by an
# nginx autoindex file server, skipping the generated index pages.
# --level=inf lifts wget's default recursion depth limit of 5 so files in
# deeply nested directories are not silently skipped.
wget -c --recursive --level=inf --no-parent --no-host-directories --reject "index.html*" http://ipaddress:8099/