当前位置:   article > 正文

jieba库:Tokenizer()类详解(一)初始化_jieba.tokenizer

jieba.tokenizer

2021SC@SDUSC


下面来看 Tokenizer 类中与初始化相关的源码:

  class Tokenizer(object):
      """Tokenizer state plus lazy, cache-backed loading of its dictionary.

      The constructor only records settings; the dictionary itself is loaded
      on demand by :meth:`initialize` (directly, or through
      :meth:`check_initialized`).
      """

      def __init__(self, dictionary=DEFAULT_DICT):
          """Record the dictionary to use and reset all loaded state.

          Args:
              dictionary: Dictionary to load. When equal to ``DEFAULT_DICT``
                  it is stored unchanged; any other value is passed through
                  ``_get_abs_path`` before being stored.
          """
          # Re-entrant thread lock; ``initialize`` holds it while loading.
          self.lock = threading.RLock()
          # For reference, the module-level defaults quoted with this listing:
          #     DEFAULT_DICT = None
          #     DEFAULT_DICT_NAME = "dict.txt"
          if dictionary == DEFAULT_DICT:
              self.dictionary = dictionary
          else:
              self.dictionary = _get_abs_path(dictionary)
          # Word -> frequency mapping, filled in by ``initialize``.
          self.FREQ = {}
          # Sum of all word frequencies, filled in by ``initialize``.
          self.total = 0
          # NOTE(review): presumably maps user-added words to part-of-speech
          # tags; it is not used in this section -- confirm against callers.
          self.user_word_tag_tab = {}
          # The dictionary is loaded lazily; this flips to True once
          # ``initialize`` has completed.
          self.initialized = False
          # Optional overrides for the cache location, read by ``initialize``.
          self.tmp_dir = None
          self.cache_file = None

      def __repr__(self):
          """Return a debug representation showing the configured dictionary."""
          return '<Tokenizer dictionary=%r>' % self.dictionary

      @staticmethod
      def gen_pfdict(f):
          """Build the prefix-frequency dictionary from an open dictionary file.

          Each line must hold at least ``word freq`` separated by a single
          space; only the first two fields are read. Lines are stripped and
          decoded as UTF-8, so ``f`` is expected to yield bytes.

          Every prefix of every word is also inserted with frequency 0 unless
          it is already present.

          Args:
              f: Iterable, closable file object for the dictionary. It is
                  always closed before this function returns or raises.

          Returns:
              A ``(lfreq, ltotal)`` tuple: the word/prefix -> frequency dict
              and the sum of all frequencies read.

          Raises:
              ValueError: If a line cannot be decoded, does not contain two
                  fields, or its frequency is not an integer.
          """
          lfreq = {}
          ltotal = 0
          # ``resolve_filename`` is quoted with this listing as:
          #     def resolve_filename(f):
          #         try:
          #             return f.name
          #         except AttributeError:
          #             return repr(f)
          # It only supplies a name for the error message below.
          f_name = resolve_filename(f)
          try:
              for lineno, line in enumerate(f, 1):
                  try:
                      line = line.strip().decode('utf-8')
                      # Only the first two fields (word, frequency) are used.
                      word, freq = line.split(' ')[:2]
                      freq = int(freq)
                      lfreq[word] = freq
                      ltotal += freq
                      # Register every prefix of the word with frequency 0.
                      for ch in xrange(len(word)):
                          wfrag = word[:ch + 1]
                          if wfrag not in lfreq:
                              lfreq[wfrag] = 0
                  except ValueError:
                      raise ValueError(
                          'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
          finally:
              # Close the file even when an entry is invalid, so a bad
              # dictionary does not leak the open handle.
              f.close()
          return lfreq, ltotal

      def initialize(self, dictionary=None):
          """Load the dictionary into ``self.FREQ`` / ``self.total``.

          The data is read from a marshal cache file when a usable one exists;
          otherwise it is rebuilt with :meth:`gen_pfdict` and the cache is
          rewritten. Returns immediately if the tokenizer is already
          initialized for the requested dictionary.

          Args:
              dictionary: Optional dictionary to switch to. When given and
                  different from the current one (or when not yet
                  initialized), it replaces ``self.dictionary`` and forces a
                  reload. When falsy, the current ``self.dictionary`` is used.
          """
          if dictionary:
              abs_path = _get_abs_path(dictionary)
              if self.dictionary == abs_path and self.initialized:
                  return
              else:
                  self.dictionary = abs_path
                  self.initialized = False
          else:
              abs_path = self.dictionary

          with self.lock:
              # If a write lock is registered for this dictionary, acquire and
              # release it so that we wait for a build that is in progress.
              try:
                  with DICT_WRITING[abs_path]:
                      pass
              except KeyError:
                  pass
              # Another caller may have finished loading while we waited.
              if self.initialized:
                  return

              default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
              t1 = time.time()
              if self.cache_file:
                  cache_file = self.cache_file
              # default dictionary
              elif abs_path == DEFAULT_DICT:
                  cache_file = "jieba.cache"
              # custom dictionary
              else:
                  cache_file = "jieba.u%s.cache" % md5(
                      abs_path.encode('utf-8', 'replace')).hexdigest()
              cache_file = os.path.join(
                  self.tmp_dir or tempfile.gettempdir(), cache_file)
              # prevent absolute path in self.cache_file
              tmpdir = os.path.dirname(cache_file)

              load_from_cache_fail = True
              # The cache is used only if it exists and, for a custom
              # dictionary, is newer than the dictionary file itself.
              if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                      os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                  default_logger.debug(
                      "Loading model from cache %s" % cache_file)
                  try:
                      with open(cache_file, 'rb') as cf:
                          self.FREQ, self.total = marshal.load(cf)
                      load_from_cache_fail = False
                  except Exception:
                      # Any failure reading the cache falls back to a rebuild.
                      load_from_cache_fail = True

              if load_from_cache_fail:
                  # Register (or reuse) the write lock for this dictionary
                  # while the prefix dict is rebuilt and the cache rewritten.
                  wlock = DICT_WRITING.get(abs_path, threading.RLock())
                  DICT_WRITING[abs_path] = wlock
                  with wlock:
                      self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                      default_logger.debug(
                          "Dumping model to file cache %s" % cache_file)
                      try:
                          # prevent moving across different filesystems
                          fd, fpath = tempfile.mkstemp(dir=tmpdir)
                          with os.fdopen(fd, 'wb') as temp_cache_file:
                              marshal.dump(
                                  (self.FREQ, self.total), temp_cache_file)
                          _replace_file(fpath, cache_file)
                      except Exception:
                          # A failed cache dump is logged, not raised: the
                          # in-memory dictionary is already usable.
                          default_logger.exception("Dump cache file failed.")

                  try:
                      del DICT_WRITING[abs_path]
                  except KeyError:
                      pass

              self.initialized = True
              default_logger.debug(
                  "Loading model cost %.3f seconds." % (time.time() - t1))
              default_logger.debug("Prefix dict has been built successfully.")

      def check_initialized(self):
          """Run :meth:`initialize` if the dictionary has not been loaded yet."""
          if not self.initialized:
              self.initialize()

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/羊村懒王/article/detail/329813
推荐阅读
相关标签
  

闽ICP备14008679号