赞
踩
2021SC@SDUSC
看到代码:
class Tokenizer(object):
    """Dictionary-backed tokenizer state: loads a word-frequency dictionary
    (lazily, on first use) and keeps the prefix-frequency table used for
    segmentation.

    NOTE(review): this class references module-level names not visible in
    this chunk (DEFAULT_DICT, _get_abs_path, resolve_filename, xrange,
    DICT_WRITING, default_logger, md5, _replace_file) and a method
    get_dict_file defined elsewhere in the file.
    """

    def __init__(self, dictionary=DEFAULT_DICT):
        # Re-entrant lock guarding initialization so concurrent threads do
        # not race while building/loading the shared frequency table.
        self.lock = threading.RLock()
        # Dictionary selection. DEFAULT_DICT is defined elsewhere in the
        # file (the module's bundled dict.txt is used when it is chosen);
        # any other value is resolved to an absolute path.
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        # FREQ maps each word (and every prefix of each word) to its
        # frequency; prefixes not present as real words get frequency 0
        # (see gen_pfdict).
        self.FREQ = {}
        # Sum of all word frequencies in FREQ.
        self.total = 0
        # Mapping of user-added words to their part-of-speech tags.
        self.user_word_tag_tab = {}
        # Lazy loading: the dictionary is only built on first use. Call
        # initialize() explicitly to load it up front.
        self.initialized = False
        # Optional overrides for where the marshal cache file is written.
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        # Human-readable representation showing which dictionary backs
        # this tokenizer instance.
        return '<Tokenizer dictionary=%r>' % self.dictionary

    @staticmethod
    # Parse an open dictionary file object: returns (lfreq, ltotal) where
    # lfreq maps every word and every word prefix to a frequency, and
    # ltotal is the sum of the real words' frequencies.
    def gen_pfdict(f):
        lfreq = {}
        ltotal = 0
        # resolve_filename (defined elsewhere in this file) returns f.name
        # when available, otherwise repr(f) — used only for error messages.
        f_name = resolve_filename(f)
        # Read the dictionary line by line, accumulating frequencies.
        for lineno, line in enumerate(f, 1):
            try:
                # Lines are read as bytes; decode assumes UTF-8 content.
                line = line.strip().decode('utf-8')
                # Each entry is "word freq [tag ...]"; only the first two
                # fields are used here.
                word, freq = line.split(' ')[:2]
                freq = int(freq)
                lfreq[word] = freq
                ltotal += freq
                # Register every proper prefix of the word with frequency 0
                # (unless the prefix is itself a dictionary word) so the
                # prefix dictionary can be walked during segmentation.
                for ch in xrange(len(word)):
                    wfrag = word[:ch + 1]
                    if wfrag not in lfreq:
                        lfreq[wfrag] = 0
            except ValueError:
                # A malformed line (wrong field count / non-integer freq)
                # aborts loading with a message pointing at the bad line.
                raise ValueError(
                    'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        # The file object is consumed and closed here, even though it was
        # opened by the caller.
        f.close()
        return lfreq, ltotal

    def initialize(self, dictionary=None):
        """Load the dictionary and build the prefix-frequency table.

        If *dictionary* is given, switch this tokenizer to that file;
        otherwise use the dictionary chosen at construction time.
        Results are cached with marshal in a temp-dir cache file so
        subsequent startups can skip re-parsing the dictionary.
        """
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            # Already initialized with this exact dictionary: nothing to do.
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                # Dictionary changed: force a rebuild below.
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary

        # Serialize initialization across threads of this instance.
        with self.lock:
            # If another thread/process entry is currently writing the
            # cache for this path, briefly acquire-and-release its lock to
            # wait for it to finish; absent entry means nobody is writing.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            # Re-check under the lock: another thread may have finished
            # initialization while we were waiting.
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary: cache name derived from the path's md5 so
            # different dictionaries get distinct cache files.
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # prevent absolute path in self.cache_file
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            # Use the cache only if it exists and is not older than the
            # dictionary file (the default dictionary is assumed current).
            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                # Any failure while reading/unmarshalling the cache falls
                # back to rebuilding from the dictionary file.
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    load_from_cache_fail = True

            # Cache miss or cache load failure: rebuild from source.
            if load_from_cache_fail:
                # Publish a per-path write lock so other initializers wait
                # (see the DICT_WRITING check above).
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    # Best-effort cache write: failure is logged, not fatal.
                    try:
                        # prevent moving across different filesystems
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        # Atomically move the temp file into place.
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")

                # Drop the write-lock entry; KeyError means another thread
                # already removed it.
                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

            # Initialization complete.
            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
            default_logger.debug("Prefix dict has been built successfully.")

    def check_initialized(self):
        # Lazy-load guard: trigger initialization on first real use.
        if not self.initialized:
            self.initialize()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。