赞
踩
import os
from multiprocessing import Pool


def safe_readline(f):
    """Read one line from *f*, retrying if the position is mid-character.

    After an arbitrary ``seek`` the file position may land inside a
    multi-byte character, in which case decoding raises
    ``UnicodeDecodeError``.  The position is then moved back one byte at a
    time until ``readline`` succeeds.

    Args:
        f: A seekable file object opened in text mode.

    Returns:
        The line that was read (an empty string at end of file).
    """
    pos = f.tell()
    while True:
        try:
            return f.readline()
        except UnicodeDecodeError:
            # Step back one byte to look for the start of the character.
            pos -= 1
            f.seek(pos)


def async_kd_tokenizer(filename, worker_id, num_workers):
    """Return the cleaned, non-empty lines owned by one worker's file slice.

    The file is split into ``num_workers`` byte ranges of ``size //
    num_workers`` bytes; the last worker's range is extended to the end of
    the file so the remainder bytes are not lost.  A line is owned by the
    worker whose range contains the position where the line starts (a line
    starting exactly on a boundary belongs to the earlier worker), so each
    line is returned by exactly one worker.

    Every returned line has all spaces and newline characters removed;
    lines that are empty after that are skipped.  The file size and the
    worker's start offset are printed.

    Args:
        filename: Path of the text file to read.
        worker_id: Zero-based index of this worker.
        num_workers: Total number of workers sharing the file.

    Returns:
        list[str]: The cleaned lines of this worker's slice, in file order.
    """
    with open(filename, 'r') as f:
        # Seek-based access, so the overall file size does not matter.
        size = os.fstat(f.fileno()).st_size
        print(f'size {size}')
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        # The last worker also takes the ``size % num_workers`` tail bytes.
        if worker_id == num_workers - 1:
            end = size
        else:
            end = offset + chunk_size
        # NOTE(review): seeking to an arbitrary byte offset in a text-mode
        # file relies on tell()/seek() positions being byte offsets, as the
        # original code already did - confirm for the encodings in use.
        f.seek(offset)
        print(f'offset {offset}')
        if worker_id > 0:
            # Drop the line containing (or starting at) `offset`: it is
            # owned by an earlier worker.  Checking worker_id rather than
            # offset also covers files smaller than num_workers bytes,
            # where every offset is 0.
            safe_readline(f)
        lines = []
        # Check the boundary before every read (including the first one and
        # after blank lines) so a line is never picked up by two workers.
        while f.tell() <= end:
            line = f.readline()
            if not line:
                break  # end of file
            cleaned = line.replace(" ", '').replace("\n", '')
            if cleaned:
                lines.append(cleaned)
        return lines


def encode_file(path, workers=4):
    """Read *path* with a pool of processes and return all cleaned lines.

    Args:
        path: Path of the text file to read; must exist.
        workers: Number of worker processes (and file slices).

    Returns:
        list[str]: The lines produced by every worker, concatenated in
        worker order.

    Raises:
        AssertionError: If *path* does not exist.
    """
    assert os.path.exists(path), f'file not found: {path}'
    tasks = [(path, i, workers) for i in range(workers)]
    # The context manager releases the pool even if a worker fails;
    # starmap blocks until every task is done and keeps task order.
    with Pool(processes=workers) as pool:
        parts = pool.starmap(async_kd_tokenizer, tasks)
    results = []
    for part in parts:
        results.extend(part)
    return results


# Guarded so that importing this module (which multiprocessing does in child
# processes under the "spawn" start method) does not start another pool.
if __name__ == "__main__":
    results = encode_file('/Users/lisen/Downloads/如果蜗牛有爱情.txt', workers=4)
    print(results)
import threading


class FileHandlerThread(threading.Thread):
    """Thread that runs ``func(*args)`` and keeps the outcome for the caller.

    After the thread has been joined, ``result`` holds the return value of
    ``func`` and ``exception`` holds the exception it raised, if any.
    """

    def __init__(self, func, args):
        """Store the callable and the positional arguments to call it with."""
        super().__init__()
        self.args = args
        self.func = func
        # Initialised up front so get_result() needs no try/except.
        self.result = None
        self.exception = None

    def run(self):
        """Call ``func(*args)``, recording its return value or its exception."""
        try:
            self.result = self.func(*self.args)
        except Exception as exc:
            # Kept so the thread that joins us can re-raise the real error.
            self.exception = exc

    def get_result(self):
        """Return the value produced by ``func``, or ``None`` if there is none.

        ``None`` is returned when the thread has not run yet or when ``func``
        raised; in the latter case the error is available as ``exception``.
        """
        return self.result


def encode_file_thread(path, workers=4):
    """Read *path* with a group of threads and return all cleaned lines.

    Each thread runs ``async_kd_tokenizer(path, i, workers)`` for its index
    ``i``; the per-thread lists are concatenated in thread order.

    Args:
        path: Path of the text file to read; must exist.
        workers: Number of threads (and file slices).

    Returns:
        list: The concatenated results of every thread.

    Raises:
        AssertionError: If *path* does not exist.
        Exception: Whatever exception a worker raised, re-raised here.
    """
    assert os.path.exists(path), f'file not found: {path}'
    workers_thread = []
    for i in range(workers):
        w = FileHandlerThread(async_kd_tokenizer, args=(path, i, workers))
        workers_thread.append(w)
        w.start()
    for w in workers_thread:
        w.join()
    results = []
    for w in workers_thread:
        if w.exception is not None:
            # Surface the worker's own error instead of the TypeError that
            # concatenating a missing (None) result would produce.
            raise w.exception
        results += w.get_result()
    return results


# Guarded so that importing this module does not read the file again.
if __name__ == "__main__":
    results_th = encode_file_thread('/Users/lisen/Downloads/如果蜗牛有爱情.txt', workers=4)
    print(results_th)
    print(results_th == results)  # expected output: True
参考:https://www.liujiangblog.com/course/python/79
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。