当前位置:   article > 正文

Python 多线程、多进程处理单个(大、超大)文件

python多线程处理大文件

1 多进程(multiprocessing.Pool)

import os
from multiprocessing import Pool


def safe_readline(f):
    """Read the next line from ``f``, retrying from earlier positions on
    decode errors.

    The position reported by ``f.tell()`` on entry is remembered.  Whenever
    ``f.readline()`` raises ``UnicodeDecodeError`` the remembered position
    is moved back by one and the read is attempted again from there, until
    a read succeeds.

    Args:
        f: An open file object supporting ``tell``, ``seek`` and
            ``readline``.

    Returns:
        Whatever ``f.readline()`` returns on the first attempt that does
        not raise ``UnicodeDecodeError``.
    """
    start = f.tell()
    while True:
        try:
            text = f.readline()
        except UnicodeDecodeError:
            # Step back one position and try decoding again from there.
            start -= 1
            f.seek(start)
        else:
            return text
            
            
def async_kd_tokenizer(filename, worker_id, num_workers, encoding=None):
    """Return the cleaned lines belonging to one worker's slice of a file.

    The file is divided into ``num_workers`` contiguous byte ranges of
    ``size // num_workers`` bytes each; the last range is extended to the
    end of the file so the remainder bytes are not lost.  This call returns
    the lines that *start* inside range number ``worker_id``.  Every space
    is removed from each line, and lines that are empty afterwards are
    skipped.  Because each line starts in exactly one range, concatenating
    the results for ``worker_id`` 0 .. ``num_workers - 1`` gives every
    non-empty line once, in file order.

    The file is read in binary mode so that ``seek``/``tell`` operate on
    real byte offsets (text-mode positions are opaque, and seeking to an
    arbitrary one is undefined); each line is decoded after it is read.

    Args:
        filename: Path of the file to read.
        worker_id: Zero-based index of the byte range to process.
        num_workers: Number of byte ranges the file is divided into.
        encoding: Text encoding of the file.  ``None`` (default) selects
            the locale's preferred encoding, matching the default of
            ``open`` in text mode.  The encoding must store a line feed as
            the single byte 0x0A (true for UTF-8 and GBK, not for
            UTF-16/UTF-32).

    Returns:
        list[str]: The non-empty, space-free lines of this worker's range.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a line is not valid in ``encoding``.
        ZeroDivisionError: If ``num_workers`` is 0.
    """
    if encoding is None:
        import locale  # local import keeps this function self-contained

        encoding = locale.getpreferredencoding(False)

    with open(filename, 'rb') as f:
        # fstat only reads metadata, so this is cheap however large the file is.
        size = os.fstat(f.fileno()).st_size
        print(f'size {size}')
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        if worker_id == num_workers - 1:
            # Integer division may leave up to num_workers - 1 trailing
            # bytes; the last worker must cover them.
            end = size
        else:
            end = offset + chunk_size
        print(f'offset {offset}')
        if offset > 0:
            # Finish the line containing byte offset - 1.  If that byte is a
            # newline, only it is consumed and the line starting exactly at
            # ``offset`` is kept for this worker.
            f.seek(offset - 1)
            f.readline()
        lines = []
        # A line is ours only if it starts before ``end``; a line that
        # straddles ``end`` is still read completely.
        while f.tell() < end:
            raw = f.readline()
            if not raw:
                break
            # Normalise "\r\n" and lone "\r" to "\n", as text-mode reading
            # with universal newlines would.
            text = raw.decode(encoding).replace('\r\n', '\n').replace('\r', '\n')
            for piece in text.split('\n'):
                piece = piece.replace(' ', '')
                if piece:
                    lines.append(piece)
        return lines
    
    
def encode_file(path, workers=4):
    """Process ``path`` with a pool of worker processes and merge the output.

    One ``async_kd_tokenizer`` task is submitted per worker index; once the
    pool has been closed and joined, the per-task lists are concatenated in
    worker-index order.

    Args:
        path: Path of the file to process; it must exist.
        workers: Number of pool processes, and also the number of tasks
            submitted.

    Returns:
        list: The concatenation of every task's result, in worker order.

    Raises:
        AssertionError: If ``path`` does not exist.
    """
    assert os.path.exists(path)
    pool = Pool(processes=workers)
    pending = [
        pool.apply_async(async_kd_tokenizer, (path, worker_id, workers))
        for worker_id in range(workers)
    ]
    # No more tasks will be submitted; wait for every worker to finish.
    pool.close()
    pool.join()

    merged = []
    for handle in pending:
        # get() hands back the task's return value (or re-raises its error).
        merged.extend(handle.get())
    return merged

# Demo: process the sample file with 4 worker processes and print the lines.
# The __main__ guard is needed because multiprocessing start methods that
# re-import the main module in each child (e.g. "spawn") would otherwise
# execute this call again inside every worker.
if __name__ == '__main__':
    results = encode_file('/Users/lisen/Downloads/如果蜗牛有爱情.txt', workers=4)
    print(results)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60

2 多线程(threading.Thread)

import threading


class FileHandlerThread(threading.Thread):
    """Thread that calls ``func(*args)`` and keeps the return value.

    After the thread has been started and joined, ``get_result()`` returns
    what ``func`` returned.  If the thread has not run yet, or ``func``
    raised, ``get_result()`` returns ``None``.
    """

    def __init__(self, func, args):
        """Store the callable and its positional arguments.

        Args:
            func: Callable to execute in the thread.
            args: Iterable of positional arguments passed to ``func``.
        """
        super().__init__()
        self.args = args
        self.func = func
        # Defined up front so get_result() has a well-defined value without
        # catching a missing-attribute error.
        self.result = None

    def run(self):
        """Execute ``func(*args)`` and remember its return value."""
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the value produced by ``run``, or ``None`` if there is none."""
        return self.result

    
def encode_file_thread(path, workers=4):
    """Process ``path`` with one thread per worker index and merge the output.

    A ``FileHandlerThread`` running ``async_kd_tokenizer`` is created and
    started for each worker index.  After all threads have been joined,
    their results are concatenated in worker-index order.

    Args:
        path: Path of the file to process; it must exist.
        workers: Number of threads to start.

    Returns:
        list: The concatenation of every thread's result, in worker order.

    Raises:
        AssertionError: If ``path`` does not exist.
    """
    assert os.path.exists(path)
    threads = []
    for worker_id in range(workers):
        thread = FileHandlerThread(async_kd_tokenizer, args=(path, worker_id, workers))
        threads.append(thread)
        thread.start()
    # Wait for every thread before touching any result.
    for thread in threads:
        thread.join()
    return [item for thread in threads for item in thread.get_result()]
	
# Demo: process the same sample file with 4 threads and compare the output
# with ``results`` from the multi-process run earlier in this file.
# Kept under the __main__ guard so that pool worker processes which
# re-import this module do not run the demo themselves.
if __name__ == '__main__':
    results_th = encode_file_thread('/Users/lisen/Downloads/如果蜗牛有爱情.txt', workers=4)
    print(results_th)

    # Expected to print True: both runners use the same per-chunk reader.
    print(results_th == results)  # True
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39

参考:https://www.liujiangblog.com/course/python/79

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/IT小白/article/detail/625823
推荐阅读
相关标签
  

闽ICP备14008679号