赞
踩
def read_large_file(m_fr):
    """
    Generator that reads a large file one line at a time.

    :param m_fr: an open file object providing ``readline()``
    :return: yields each line in turn; stops at the first empty read
    """
    current = m_fr.readline()
    while current:
        yield current
        current = m_fr.readline()
-
-
# Split a file by line count.
# Sub-files are written to m_dirpath (per the original note, a folder named
# after the source file -- that naming is up to the caller).
def file_split_quick(m_filepath, m_num, m_dirpath, m_num_dict):
    """
    Split a large text file into sub-files of at most ``m_num`` lines each.

    If the file has more than ``m_num`` lines, it is written out as
    ``<stem>_001<ext>``, ``<stem>_002<ext>``, ... inside ``m_dirpath``.
    Otherwise the file is copied unchanged into ``m_dirpath`` under its
    original name.

    :param m_filepath: path of the file to split (read as UTF-8 when split)
    :param m_num: number of lines per sub-file
    :param m_dirpath: directory receiving the sub-files (created if missing)
    :param m_num_dict: dict updated in place; maps each sub-file path to the
        number of lines it holds
    :return: list of the sub-file paths, in the order they were written
    :raises AssertionError: if ``m_filepath`` does not exist
    """
    if not os.path.exists(m_filepath):
        m_msg = 'error: not exist: {}'.format(m_filepath)
        print(m_msg)
        # Raise explicitly: a bare ``assert`` is stripped under ``python -O``.
        raise AssertionError(m_msg)
    os.makedirs(m_dirpath, exist_ok=True)

    m_filename = os.path.basename(m_filepath)

    # Count lines in-process instead of shelling out to ``wc -l``: no quoting
    # problems with unusual paths, no dependency on the external tool's output
    # format, and a final line without a trailing newline is counted too.
    # Binary mode keeps this pass free of any decoding.
    with open(m_filepath, 'rb') as m_fb:
        m_total_num = sum(1 for _ in m_fb)

    m_pathlist = []
    if m_total_num <= m_num:
        # Small enough already: keep the file whole under its original name.
        m_newpath = os.path.join(m_dirpath, m_filename)
        shutil.copyfile(m_filepath, m_newpath)
        m_num_dict[m_newpath] = m_total_num
        m_pathlist.append(m_newpath)
        return m_pathlist

    m_stem, m_ext = os.path.splitext(m_filename)
    m_count = 0
    with open(m_filepath, 'r', encoding='utf-8') as m_fr:
        while True:
            # A file object iterates line by line, so islice pulls the next
            # m_num lines without loading the whole file.
            m_lines = list(islice(m_fr, m_num))
            if not m_lines:
                break
            m_count += 1
            m_subpath = os.path.join(
                m_dirpath, m_stem + '_' + str(m_count).zfill(3) + m_ext)
            with open(m_subpath, 'w', encoding='utf-8') as m_fw:
                m_fw.writelines(m_lines)
            m_pathlist.append(m_subpath)
            m_num_dict[m_subpath] = len(m_lines)
            print('done: {} {}'.format(m_num_dict[m_subpath], m_subpath))
    return m_pathlist
ChatGPT真是个好东西!
用 Linux 的 split 命令拆分文件:
# Usage: split <file path> <prefix for the output files>
# Example:
split test.txt test_

# Split a file by line count
#   -l                        number of lines per output file
#   -a                        suffix length (default is 2)
#   --numeric-suffixes=1      use numeric suffixes starting at 1 (may affect splitting speed)
#   --additional-suffix=.txt  extension appended to each output file (may affect splitting speed)
split -l 10000 -a 3 --numeric-suffixes=1 --additional-suffix=.txt test.txt test_

# Show the first 10 lines of an output file
head -n 10 test_001.txt
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。