赞
踩
1.系统环境
硬件环境(Ascend/GPU/CPU): Ascend
执行模式:静态图(MindSpore 2.1.1)
Python版本:3.7(注:下方报错堆栈中的路径显示实际运行的解释器为 Python 3.9,请以实际环境为准)
操作系统平台:Linux
2. 报错信息
2.1 问题描述
使用MindSpore跑大模型报以下错误:
- Exception in thread Thread-1:
- Traceback (most recent call last):
- File "/usr/local/python3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
- Exception in thread Thread-2:
- Traceback (most recent call last):
- File "/usr/local/python3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
- self.run()
- File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/utils/multiprocess_util.py", line 91, in run
- self.run()
- File "/usr/local/python3.9/lib/python3.9/threading.py", line 910, in run
- key, func, args, kwargs = self.task_q.get(timeout=TIMEOUT)
- File "<string>", line 2, in get
- self._target(*self._args, **self._kwargs)
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/pool.py", line 513, in _handle_workers
- cls._maintain_pool(ctx, Process, processes, pool, inqueue,
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/pool.py", line 337, in _maintain_pool
- Pool._repopulate_pool_static(ctx, Process, processes, pool,
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/pool.py", line 326, in _repopulate_pool_static
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 809, in _callmethod
- w.start()
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 121, in start
- self._popen = self._Popen(self)
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/context.py", line 291, in _Popen
- conn.send((self._id, methodname, args, kwds))
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 211, in send
- return Popen(process_obj)
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/popen_forkserver.py", line 35, in __init__
- self._send_bytes(_ForkingPickler.dumps(obj))
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 416, in _send_bytes
- super().__init__(process_obj)
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__
- self._send(header + buf)
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 373, in _send
- self._launch(process_obj)
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/popen_forkserver.py", line 58, in _launch
- n = write(self._handle, buf)
- BrokenPipeError: [Errno 32] Broken pipe
- f.write(buf.getbuffer())
- BrokenPipeError: [Errno 32] Broken pipe
- Exception in thread Thread-1:
- Traceback (most recent call last):
- File "/usr/local/python3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
- self.run()
- File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/utils/multiprocess_util.py", line 91, in run
- Exception in thread Thread-1:
- Traceback (most recent call last):
- File "/usr/local/python3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
- Exception in thread Thread-1:
- Traceback (most recent call last):
- File "/usr/local/python3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
- self.run()
- File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/utils/multiprocess_util.py", line 91, in run
- key, func, args, kwargs = self.task_q.get(timeout=TIMEOUT)
- File "<string>", line 2, in get
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
- kind, result = conn.recv()
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 255, in recv
- buf = self._recv_bytes()
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
- buf = self._recv(4)
- File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
- raise EOFError
- EOFError
复制
3.解决方法
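预训练模型太大,加载模型时 host 内存被耗尽,操作系统会选择性地杀掉部分进程(例如 Linux 的 OOM Killer)以释放被占用的内存。被杀掉的进程中包含算子编译所用的多进程子进程或管理进程,其通信管道随之断开,于是仍在运行的线程在写管道时报 BrokenPipeError、在读管道时报 EOFError,即上述报错。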
建议排查方向:先确认 host 内存是否被占满(例如在加载模型期间用 free -h 或 top 观察内存使用,并用 dmesg 查看是否有 Out of memory / Killed process 记录),再分析内存耗尽的原因。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。