赞
踩
此次安装部署均在Esxi虚拟机上运行。系统采用通用稳定的centos7系统,移植到其他(linux)系统应该问题不大。软件服务器的Esxi虚拟机的创建部分就跳过了.
IP | 主机名 | 配置 | 备注 |
---|---|---|---|
192.168.0.23 | master | 4C/8G/80G | 管理、存储节点 |
192.168.0.24 | node01 | 4C/8G/80G | 计算节点 |
192.168.0.25 | node02 | 4C/8G/80G | 计算节点 |
#!/bin/bash
# One-key server tuning tool (CentOS 7). Part 1: network check, package
# install, service trimming and kernel (sysctl) tuning.
# NOTE(review): every function below modifies system-wide state; run as root.

# Print the FQDN and probe external connectivity (6 ICMP echoes to baidu).
function define_check_network() {
    echo "主机名为$(hostname -f)"
    ping www.baidu.com -c 6
}

# Disable SELinux (effective after reboot) and install baseline packages.
function define_yum() {
    sed -i '/SELINUX/s/enforcing/disabled/' /etc/selinux/config
    yum clean all
    yum -y install bash vim wget curl sysstat gcc gcc-c++ make lsof sudo unzip \
        openssh-clients net-tools systemd rpm yum rsyslog logrotate crontabs \
        python-libs centos-release p7zip file
    # yum -y update && yum -y upgrade
}

# Stop non-essential daemons and keep them from starting at boot.
# NOTE(review): chronyd is disabled here but the cluster guide later
# installs and enables chrony — reconcile before using both.
function define_tuning_services() {
    systemctl stop postfix firewalld chronyd cups
    systemctl disable postfix firewalld chronyd cups
    echo "非关键系统服务已经关闭"
}

# Rewrite /etc/sysctl.conf with tuned kernel parameters (backup kept).
# Settings take effect after `sysctl -p` or a reboot (not run here,
# matching the original behaviour).
function define_tuning_kernel() {
    echo "内核参数优化"
    cp /etc/sysctl.conf /etc/sysctl.conf.bak
    # WARNING(review): net.ipv4.tcp_tw_recycle breaks clients behind NAT and
    # was removed in Linux 4.12; also tcp_tw_reuse/tw_recycle need
    # tcp_timestamps=1, yet timestamps is set to 0 below. Values are kept
    # as-is from the original — review before production use.
    cat > /etc/sysctl.conf << 'EOF'
##内核默认参数
kernel.sysrq = 0
kernel.core_uses_pid = 1
kernel.msgmnb = 65536
kernel.msgmax = 65536
kernel.shmmax = 68719476736
kernel.shmall = 4294967296
kernel.sem=500 64000 64 256
##打开文件数参数(20*1024*1024)
fs.file-max= 20971520
##WEB Server参数
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=1
net.ipv4.tcp_fin_timeout=30
net.ipv4.tcp_keepalive_time=1200
net.ipv4.ip_local_port_range = 1024 65535
net.ipv4.tcp_rmem=4096 87380 8388608
net.ipv4.tcp_wmem=4096 87380 8388608
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.tcp_max_tw_buckets = 5000
##TCP补充参数
net.ipv4.ip_forward = 1
net.ipv4.conf.default.rp_filter = 1
net.ipv4.conf.default.accept_source_route = 0
net.ipv4.tcp_syncookies = 1
net.ipv4.tcp_sack = 1
net.ipv4.tcp_window_scaling = 1
net.core.wmem_default = 8388608
net.core.rmem_default = 8388608
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.core.netdev_max_backlog = 262144
net.core.somaxconn = 65535
net.ipv4.tcp_max_orphans = 3276800
net.ipv4.tcp_timestamps = 0
net.ipv4.tcp_synack_retries = 1
net.ipv4.tcp_syn_retries = 1
net.ipv4.tcp_mem = 94500000 915000000 927000000
##禁用ipv6
net.ipv6.conf.all.disable_ipv6 =1
net.ipv6.conf.default.disable_ipv6 =1
##swap使用率优化
vm.swappiness=0
EOF
    echo "系统参数设置OK"
}
# Part 2 of the one-key tuning tool: filesystem/limits/profile tuning,
# time sync, cron jobs and the interactive menu.
# Fixes vs. original: missing `done` closing the menu loop; `read` failure
# (stdin EOF) now breaks the loop instead of spinning forever; function
# `denfine_swap` renamed to `define_swap` (typo); stray `;` before the
# cron redirection removed so yum output is actually discarded.

# System-level tuning. Guarded so it only runs once (noatime marker in fstab).
function define_tuning_system() {
    if grep -q noatime /etc/fstab; then
        echo "第二次执行脚本,请手动检查错误"
        exit 1
    else
        echo "脚本首次执行"
    fi
    # Disk I/O: mount xfs filesystems with noatime.
    sed -i '/xfs/s/defaults/defaults,noatime/' /etc/fstab
    ##nproc setting below applies to CentOS 6 only
    #sed -i 's/1024/65535/' /etc/security/limits.d/90-nproc.conf
    # nproc setting for CentOS 7.
    sed -i 's/4096/524288/' /etc/security/limits.d/20-nproc.conf
    # Raise open-files limit.
    echo "* soft nofile 1024000" >> /etc/security/limits.conf
    echo "* hard nofile 1024000" >> /etc/security/limits.conf
    # Raise max-processes limit.
    echo "* soft nproc 1024000" >> /etc/security/limits.conf
    echo "* hard nproc 1024000" >> /etc/security/limits.conf
    echo "session required /lib64/security/pam_limits.so" >> /etc/pam.d/login
    # Global shell environment tweaks.
    echo 'export TMOUT=600' >> /etc/profile
    echo 'export TIME_STYLE="+%Y/%m/%d %H:%M:%S"' >> /etc/profile
    echo 'export HISTTIMEFORMAT="%F %T `whoami` "' >> /etc/profile
    echo 'unset MAILCHECK' >> /etc/profile
    sed -i '/HISTSIZE/s/1000/12000/' /etc/profile
    source /etc/profile
    # Silence noisy systemd session/slice messages in rsyslog.
    echo 'if $programname == "systemd" and ($msg contains "Starting Session" or $msg contains "Started Session" or $msg contains "Created slice" or $msg contains "Starting user-" or $msg contains "Starting User Slice of" or $msg contains "Removed session" or $msg contains "Removed slice User Slice of" or $msg contains "Stopping User Slice of") then stop' >/etc/rsyslog.d/ignore-systemd-session-slice.conf
    systemctl restart rsyslog
    # Permission hardening (left disabled, as in original):
    # echo 'umask 0022' >> /etc/profile
    # Disable Ctrl+Alt+Del reboot.
    rm -rf /usr/lib/systemd/system/ctrl-alt-del.target
    # Boot to multi-user (text) target.
    systemctl set-default multi-user.target
    # Disable transparent hugepages at boot.
    chmod +x /etc/rc.d/rc.local
    echo "echo never > /sys/kernel/mm/transparent_hugepage/enabled;" >> /etc/rc.d/rc.local
    echo "echo never > /sys/kernel/mm/transparent_hugepage/defrag;" >> /etc/rc.d/rc.local
    # Enable compression of rotated logs.
    sed -i 's/#compress/compress/' /etc/logrotate.conf
    # Drop weak SSH ciphers.
    echo "Ciphers aes128-ctr,aes192-ctr,aes256-ctr" >> /etc/ssh/sshd_config
    # Cap journald disk usage and retention.
    echo "SystemMaxUse=2048M" >> /etc/systemd/journald.conf
    echo "ForwardToSyslog=no" >> /etc/systemd/journald.conf
    echo "MaxFileSec=14day" >> /etc/systemd/journald.conf
    systemctl restart systemd-journald.service
    # Shell aliases for all users.
    cat >> /etc/bashrc << EOF
##系统别名设置
alias vi='vim'
alias ls='ls -trlh --color=auto'
alias grep='grep --color=auto'
EOF
    source /etc/bashrc
    echo '系统别名设置完成'
}

# Time zone correction plus NTP sync now, at boot and every 20 minutes.
function define_ntpdate1() {
    yum -y install ntpdate
    echo "/usr/sbin/ntpdate -us ntp1.aliyun.com;hwclock -w;" >> /etc/rc.d/rc.local
    timedatectl set-timezone Asia/Shanghai
    ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && hwclock -w
    /usr/sbin/ntpdate -us ntp1.aliyun.com
    hwclock -w
    cat >> /var/spool/cron/root << EOF
##时间同步
0-59/20 * * * * /usr/sbin/ntpdate -us ntp1.aliyun.com;hwclock -w;
EOF
}

# Nightly refresh of core packages via cron.
function define_update() {
    cat >> /var/spool/cron/root << EOF
#yum update software
45 00 * * * /usr/bin/yum -y install bash sudo ntpdate openssh openssl vim systemd rpm yum rsyslog logrotate crontabs curl > /dev/null 2>&1;
EOF
}

# Hourly swap flush (swapoff/swapon cycle) via cron.
function define_swap() {
    cat >> /var/spool/cron/root << EOF
## swap enable/disable
15 * * * * /usr/sbin/swapoff -a && /usr/sbin/swapon -a;
EOF
}

# Run the full local-environment setup sequence.
function define_localhost() {
    define_yum
    define_tuning_services
    define_tuning_kernel
    define_tuning_system
    define_ntpdate1
    define_update
    define_swap
}

# Clear the staged script copy and leave.
function define_exit() {
    echo '' > /tmp/one_key.sh
    exit
}

# Interactive menu. `read` failure (e.g. stdin closed) exits the loop
# instead of looping forever on the "invalid input" branch.
while :; do
    echo ""
    echo "服务器一键优化脚本"
    echo ""
    echo ""
    echo " 0) 检查服务器网络 1) 本地环境专用"
    echo " 2) 退出脚本"
    echo
    read -p "请输入一个选项: " opmode || break
    echo
    case ${opmode} in
        0) define_check_network ;;
        1) define_localhost ;;
        2) define_exit ;;
        *) echo "无效输入" ;;
    esac
done
# Set each node's hostname — run the matching command ON THAT node only
# (192.168.0.23 → master, .24 → node01, .25 → node02):
hostnamectl set-hostname master
hostnamectl set-hostname node01
hostnamectl set-hostname node02
主节点上设置
# On master: add the cluster name resolution entries to /etc/hosts.
# (Appending non-interactively instead of editing with vim.)
cat >> /etc/hosts << 'EOF'
192.168.0.23 master
192.168.0.24 node01
192.168.0.25 node02
EOF
# Distribute the hosts file to the two compute nodes
# (enter the root password when prompted; -r is unnecessary for a file).
scp /etc/hosts root@node01:/etc/
scp /etc/hosts root@node02:/etc/
# Generate an RSA key pair (press Enter through all prompts).
ssh-keygen -t rsa
# Install the public key for master itself.
ssh-copy-id -i master
# Install the public key on the two compute nodes.
ssh-copy-id -i node01
ssh-copy-id -i node02
# Verify passwordless login works (type `exit` to return after each).
ssh node01
ssh node02
# Build/runtime dependencies for the SLURM RPM build, munge and NIS tooling
# (install on every node).
yum install -y rpm-build bzip2-devel openssl openssl-devel zlib-devel perl-DBI perl-ExtUtils-MakeMaker pam-devel readline-devel mariadb-devel python3 gtk2 gtk2-devel gcc make perl-ExtUtils* perl-Switch lua-devel hwloc-devel
# Check firewall state.
systemctl status firewalld
# Stop the firewall now.
systemctl stop firewalld
# Keep it disabled across reboots.
systemctl disable firewalld
# Deploy chrony for cluster time synchronization.
yum install chrony -y
# Replace the stock "server ..." entries in /etc/chrony.conf with the
# Aliyun pool (done non-interactively instead of editing with vim).
sed -i '/^server /d' /etc/chrony.conf
cat >> /etc/chrony.conf << 'EOF'
server time1.aliyun.com iburst
server time2.aliyun.com iburst
server time3.aliyun.com iburst
server time4.aliyun.com iburst
server time5.aliyun.com iburst
server time6.aliyun.com iburst
server time7.aliyun.com iburst
EOF
# Start, enable at boot and inspect.
systemctl start chronyd
systemctl enable chronyd
systemctl status chronyd
# Check synchronization status; expected output similar to:
#   210 Number of sources = 1
#   MS Name/IP address    Stratum Poll Reach LastRx Last sample
#   ===========================================================
#   ^* 203.107.6.88       2       6    377   12    -760us[-1221us] +/- 22ms
chronyc sources
# NFS server setup on master: install packages, verify, create the export dir.
yum install nfs-utils rpcbind -y
# Confirm both packages are installed.
rpm -qa nfs-utils rpcbind
# Shared directory that will be exported to the compute nodes.
mkdir -p /public
chmod 755 /public
# Export /public read-write to all hosts (appended non-interactively
# instead of editing /etc/exports with vim).
echo '/public *(rw,sync,insecure,no_subtree_check,no_root_squash)' >> /etc/exports
# Enable and start NFS and the RPC port mapper.
systemctl enable nfs
systemctl enable rpcbind
systemctl start nfs
systemctl start rpcbind
systemctl status nfs
//查看配置文件是否正常
[root@master ~]# showmount -e localhost
Export list for localhost:
/public *
# NFS client setup (run on each compute node).
yum install nfs-utils rpcbind -y
# Enable at boot.
systemctl enable nfs
systemctl enable rpcbind
# Start now.
systemctl start nfs
systemctl start rpcbind
# Mount point for the shared storage (-p: no error if it exists).
mkdir -p /public
# Mount at boot (appended non-interactively instead of editing fstab with vim).
echo '192.168.0.23:/public /public nfs rw,sync 0 0' >> /etc/fstab
# Mount everything in fstab now.
mount -a
node01节点 [root@node01 ~]# df -Th Filesystem Type Size Used Avail Use% Mounted on /dev/mapper/centos-root xfs 79G 1.7G 78G 3% / devtmpfs devtmpfs 3.9G 0 3.9G 0% /dev tmpfs tmpfs 3.9G 0 3.9G 0% /dev/shm tmpfs tmpfs 3.9G 8.9M 3.9G 1% /run tmpfs tmpfs 3.9G 0 3.9G 0% /sys/fs/cgroup /dev/sda1 xfs 1014M 133M 882M 14% /boot tmpfs tmpfs 799M 0 799M 0% /run/user/0 192.168.0.23:/public nfs4 79G 1.7G 78G 3% /public node02节点 [root@node02 ~]# df -Th Filesystem Type Size Used Avail Use% Mounted on /dev/mapper/centos-root xfs 79G 1.7G 78G 3% / devtmpfs devtmpfs 3.9G 0 3.9G 0% /dev tmpfs tmpfs 3.9G 0 3.9G 0% /dev/shm tmpfs tmpfs 3.9G 8.9M 3.9G 1% /run tmpfs tmpfs 3.9G 0 3.9G 0% /sys/fs/cgroup /dev/sda1 xfs 1014M 133M 882M 14% /boot tmpfs tmpfs 799M 0 799M 0% /run/user/0 192.168.0.23:/public nfs4 79G 1.7G 78G 3% /public
网络信息服务(Network Information Service,NIS)是一种用于集中管理网络中用户和组信息、主机名、邮件别名等数据的协议。通过NIS,系统管理员可以在一个中心位置管理用户账户和配置文件,简化了跨多个系统的用户管理工作,提高了网络的可维护性和安全性。NIS常用于需要集中管理用户和资源的大型网络环境中。
在SLURM环境中,NIS通过集中管理用户账户和权限,确保在整个计算集群中用户身份的一致性,简化了用户管理,减少了维护成本。
# Deploy the NIS server (master node).
yum install ypserv ypbind yp-tools rpcbind -y
# Set the NIS domain for the current boot.
nisdomainname steven.com
# Persist the domain across reboots.
echo "NISDOMAIN=steven.com" >> /etc/sysconfig/network
echo "/usr/bin/nisdomainname steven.com" >> /etc/rc.d/rc.local
# Allow the cluster subnet to query the NIS maps (appended non-interactively
# instead of editing /etc/ypserv.conf with vim).
echo '192.168.0.0/24 : * : * : none' >> /etc/ypserv.conf
# Start the NIS services.
systemctl start ypserv
systemctl start yppasswdd
systemctl start rpcbind
# Enable them at boot with systemctl enable.
systemctl enable ypserv
systemctl enable yppasswdd
systemctl enable rpcbind
# Initialize the NIS maps for this master (fixed: the original used an
# en-dash "–m", which ypinit rejects; the option is "-m").
/usr/lib64/yp/ypinit -m
# Rebuild the maps after every user change.
make -C /var/yp
systemctl restart rpcbind
systemctl restart yppasswdd
systemctl restart ypserv
# NIS client setup (run on each compute node).
yum install ypbind yp-tools rpcbind -y
# Persist the NIS domain and re-apply it at every boot.
echo "NISDOMAIN=steven.com" >> /etc/sysconfig/network
echo "/usr/bin/nisdomainname steven.com" >> /etc/rc.d/rc.local
# Point ypbind at the master server.
echo "domain steven.com server master" >> /etc/yp.conf
echo "ypserver master" >> /etc/yp.conf
systemctl start ypbind
systemctl enable ypbind
# Verify binding and run the NIS self-tests.
ypwhich
ypwhich -x
yptest
yptest中的用例3测试不通过没有关系。
至此,NIS已经初步配置完成。
MUNGE(MUNGE Uid ‘N’ Gid Emporium)是一种用于认证的工具,特别设计用于在分布式系统中验证用户的身份。MUNGE通过生成和验证带有加密签名的认证令牌,确保只有被授权的用户和进程才能访问系统资源。这种轻量级的认证机制具有高效、安全、易于配置的特点,广泛应用于高性能计算和集群环境中。
在SLURM环境中,MUNGE起到了至关重要的认证作用。SLURM利用MUNGE来验证提交作业的用户身份,确保只有合法用户才能提交和管理作业。通过这种机制,SLURM能够有效地防止未经授权的访问,保护计算资源和数据的安全。同时,MUNGE的高效性保证了认证过程不会成为系统性能的瓶颈,使得SLURM能够在大规模集群中高效运行。
# Create the munge service account with a fixed UID/GID (1108) so it is
# identical on every node — run on master and both compute nodes.
groupadd -g 1108 munge
useradd -m -c "Munge Uid 'N' Gid Emporium" -d /var/lib/munge -u 1108 -g munge -s /sbin/nologin munge
# Verify the munge account on the master node.
id munge
# From master, verify the account exists on the compute nodes too.
for node in node01 node02; do echo "$node: "; ssh root@$node "id munge"; done
# Seed the entropy pool on the management node (needed for key generation).
yum install -y rng-tools
rngd -r /dev/urandom
# In /usr/lib/systemd/system/rngd.service change the ExecStart line under
# the [Service] section to (note: systemd section names are case-sensitive,
# "[Service]" not "[service]"):
#   [Service]
#   ExecStart=/sbin/rngd -f -r /dev/urandom
systemctl daemon-reload
systemctl start rngd
systemctl enable rngd
# Install munge from EPEL (all nodes need the munge packages).
yum install epel-release -y
yum install munge munge-libs munge-devel -y
# Generate the shared key on master.
# NOTE(review): the dd below overwrites whatever create-munge-key wrote,
# so only one of the two commands is actually needed; both kept from the
# original guide.
/usr/sbin/create-munge-key -r
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key
# Fix ownership/permissions BEFORE copying: scp -p preserves the mode but
# NOT the owner, so ownership must also be corrected on each node below.
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
scp -p /etc/munge/munge.key root@node01:/etc/munge/
scp -p /etc/munge/munge.key root@node02:/etc/munge/
# Ensure munge owns the key on the compute nodes (munged refuses to start
# otherwise).
for node in node01 node02; do ssh root@$node "chown munge: /etc/munge/munge.key; chmod 400 /etc/munge/munge.key"; done
systemctl start munge
systemctl enable munge
systemctl status munge
# Local credential generation check.
munge -n
# Local decode check.
munge -n | unmunge
# Remote decode check against a compute node.
munge -n | ssh node01 unmunge
remunge
munge已经初步配置完成。
# Install and start MariaDB for SLURM accounting.
yum -y install mariadb-server
systemctl start mariadb
systemctl enable mariadb
# The first login has no password. Set one, then create the accounting
# database and the slurm DB user in a single session.
# SECURITY NOTE(review): passwords are hardcoded for the tutorial —
# change them before any real deployment.
mysql <<'SQL'
set password=password('Wang@023878');
create database slurm_acct_db;
create user slurm;
grant all on slurm_acct_db.* TO 'slurm'@'localhost' identified by '123456' with grant option;
flush privileges;
SQL
# Dedicated slurm account with a fixed UID/GID (1109) — same on every node.
groupadd -g 1109 slurm
useradd -m -c "Slurm manager" -d /var/lib/slurm -u 1109 -g slurm -s /bin/bash slurm
id slurm
# Build dependencies for the SLURM RPMs.
yum install gcc gcc-c++ readline-devel perl-ExtUtils-MakeMaker pam-devel rpm-build mysql-devel python3 -y
# Fetch the SLURM source tarball.
wget https://download.schedmd.com/slurm/slurm-22.05.3.tar.bz2
# rpm-build provides rpmbuild.
yum install rpm-build -y
# Build the SLURM RPM packages straight from the tarball.
rpmbuild -ta --nodeps slurm-22.05.3.tar.bz2
cd /root/rpmbuild/RPMS
# Copy the RPMs onto the NFS share so the compute nodes can install them.
cp -R x86_64/ /public/
# Management node install.
cd /root/rpmbuild/RPMS/x86_64
yum localinstall slurm-*
# Compute node install (from the shared storage).
cd /public/x86_64
yum localinstall slurm-*
# Copy the shipped examples into place (the .example files remain as backup).
cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf
cp /etc/slurm/slurm.conf.example /etc/slurm/slurm.conf
cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf
# slurmdbd refuses to start unless slurmdbd.conf is owned by SlurmUser and
# is not readable by others.
chown -R slurm:slurm /etc/slurm/slurmdbd.conf
chmod 600 /etc/slurm/slurmdbd.conf
# Write /etc/slurm/slurm.conf non-interactively (instead of editing with vim).
cat > /etc/slurm/slurm.conf << 'EOF'
#
# slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
################################################
#                   CONTROL                    #
################################################
ClusterName=steven #集群名称
SlurmctldHost=master #管理服务节点名称
SlurmctldPort=6817 #slurmctld服务端口
SlurmdPort=6818 #slurmd服务的端口
SlurmUser=slurm #slurm的主用户
#SlurmdUser=root #slurmd服务的启动用户
################################################
#            LOGGING & OTHER PATHS             #
################################################
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
SlurmdSpoolDir=/var/spool/slurmd
StateSaveLocation=/var/spool/slurmctld
################################################
#                  ACCOUNTING                  #
################################################
AccountingStorageEnforce=associations,limits,qos #account存储数据的配置选项
AccountingStorageHost=master #数据库存储节点
AccountingStoragePass=/var/run/munge/munge.socket.2 #munge认证文件,与slurmdbd.conf文件中的AuthInfo文件同名。
AccountingStoragePort=6819 #slurmd服务监听端口,默认为6819
AccountingStorageType=accounting_storage/slurmdbd #数据库记账服务
################################################
#                     JOBS                     #
################################################
JobCompHost=localhost #作业完成信息的数据库本节点
JobCompLoc=slurm_acct_db #数据库名称
JobCompPass=123456 #slurm用户数据库密码
JobCompPort=3306 #数据库端口
JobCompType=jobcomp/mysql #作业完成信息数据存储类型,采用mysql数据库
JobCompUser=slurm #作业完成信息数据库用户名
JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
################################################
#           SCHEDULING & ALLOCATION            #
################################################
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
################################################
#                    TIMERS                    #
################################################
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
################################################
#                    OTHER                     #
################################################
MpiDefault=none
ProctrackType=proctrack/cgroup
ReturnToService=1
SwitchType=switch/none
TaskPlugin=task/affinity
################################################
#                    NODES                     #
################################################
NodeName=master NodeAddr=192.168.0.23 CPUs=1 CoresPerSocket=1 ThreadsPerCore=1 RealMemory=200 Procs=1 State=UNKNOWN
NodeName=node[01-02] NodeAddr=192.168.0.2[4-5] CPUs=1 CoresPerSocket=1 ThreadsPerCore=1 RealMemory=200 Procs=1 State=UNKNOWN
################################################
#                  PARTITIONS                  #
################################################
PartitionName=compute Nodes=node[01-02] Default=YES MaxTime=INFINITE State=UP
EOF
# Write /etc/slurm/slurmdbd.conf (permissions set above are preserved by
# the truncating redirection).
cat > /etc/slurm/slurmdbd.conf << 'EOF'
#
# slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Authentication info
AuthType=auth/munge #认证方式,该处采用munge进行认证
AuthInfo=/var/run/munge/munge.socket.2 #为了与slurmctld控制节点通信的其它认证信息
#
# slurmDBD info
DbdAddr=master #数据库节点名
DbdHost=master #数据库IP地址
SlurmUser=slurm #用户数据库操作的用户
DebugLevel=verbose
LogFile=/var/log/slurm/slurmdbd.log #slurmdbd守护进程日志文件绝对路径
PidFile=/var/run/slurmdbd.pid #slurmdbd守护进程存储进程号文件绝对路径
#
# Database info
StorageType=accounting_storage/mysql #数据存储类型
StoragePass=123456 #存储数据库密码
StorageUser=slurm #存储数据库用户名
StorageLoc=slurm_acct_db #数据库名称
EOF
如果slurmdbd.conf文件没有按上述方式授权(chown slurm:slurm 并 chmod 600),slurmdbd启动时就会报如下"not owned by SlurmUser"错误:
[root@master slurm]# systemctl status slurmdbd
● slurmdbd.service - Slurm DBD accounting daemon
Loaded: loaded (/usr/lib/systemd/system/slurmdbd.service; disabled; vendor preset: disabled)
Active: failed (Result: exit-code) since Wed 2024-07-31 11:10:02 CST; 2min 49s ago
Process: 4790 ExecStart=/usr/sbin/slurmdbd -D -s $SLURMDBD_OPTIONS (code=exited, status=1/FAILURE)
Main PID: 4790 (code=exited, status=1/FAILURE)
Jul 31 11:10:02 master systemd[1]: Started Slurm DBD accounting daemon.
Jul 31 11:10:02 master slurmdbd[4790]: slurmdbd: fatal: slurmdbd.conf not owned by SlurmUser root!=slurm
Jul 31 11:10:02 master systemd[1]: slurmdbd.service: main process exited, code=exited, status=1/FAILURE
Jul 31 11:10:02 master systemd[1]: Unit slurmdbd.service entered failed state.
Jul 31 11:10:02 master systemd[1]: slurmdbd.service failed.
# Push the shared config files to the compute nodes (-r dropped: these are
# plain files, not directories).
for node in node01 node02; do echo "Copying to $node"; scp /etc/slurm/cgroup.conf root@$node:/etc/slurm/; done
for node in node01 node02; do echo "Copying to $node"; scp /etc/slurm/slurm.conf root@$node:/etc/slurm/; done
# Runtime directories owned by the slurm user (-p: idempotent re-runs).
# /var/spool/slurmd and /var/log/slurm are needed on every node;
# /var/spool/slurmctld only on the management node.
mkdir -p /var/spool/slurmd
chown slurm: /var/spool/slurmd
mkdir -p /var/log/slurm
chown slurm: /var/log/slurm
mkdir -p /var/spool/slurmctld
chown slurm: /var/spool/slurmctld
# Start the daemons: slurmdbd and slurmctld on master, slurmd on the
# compute nodes.
systemctl start slurmdbd
systemctl enable slurmdbd
systemctl start slurmctld
systemctl enable slurmctld
systemctl start slurmd
systemctl enable slurmd
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。