赞
踩
关于特征选择的相关知识,可以参考以下链接
项目源码里面包含Java和Python的实现,这里只列出Python实现:
代码托管:https://github.com/fighting-one-piece/repository-datamining.git
class Doc:
    """A named document together with the data attached to it.

    Besides its name, a document carries a category, its words, a TF-IDF
    word mapping, a CHI word collection and a similarities collection.
    Each of these is stored and returned as-is by a setter/getter pair;
    until the matching setter has been called the getter returns None.
    """

    def __init__(self, name):
        """Create a document called *name* with every other field unset."""
        self._name = name
        # Initialise every field up front so that calling a getter before
        # its setter yields None instead of raising AttributeError.
        self._category = None
        self._words = None
        self._tfidfWords = None
        self._chiWords = None
        self._similarities = None

    def setName(self, name):
        """Replace the document name."""
        self._name = name

    def getName(self):
        """Return the document name."""
        return self._name

    def setCategory(self, category):
        """Store the document category."""
        self._category = category

    def getCategory(self):
        """Return the document category, or None if it was never set."""
        return self._category

    def setWords(self, words):
        """Store the words of the document."""
        self._words = words

    def getWords(self):
        """Return the words of the document, or None if never set."""
        return self._words

    def setTfidfWords(self, tfidfWords):
        """Store the TF-IDF mapping (must support ``.items()``)."""
        self._tfidfWords = tfidfWords

    def getTfidfWords(self):
        """Return the TF-IDF mapping, or None if it was never set."""
        return self._tfidfWords

    def getSortedTfidfWords(self):
        """Return the TF-IDF entries sorted by value, largest first.

        The sorted list of ``(key, value)`` pairs is wrapped in a
        one-element outer list; that shape is kept for existing callers.
        An unset mapping is treated as empty, giving ``[[]]``.
        """
        entries = self._tfidfWords or {}
        ranked = sorted(entries.items(), key=lambda item: item[1], reverse=True)
        return [ranked]

    def setCHIWords(self, chiWords):
        """Store the CHI words of the document."""
        self._chiWords = chiWords

    def getCHIWords(self):
        """Return the CHI words, or None if they were never set."""
        return self._chiWords

    def setSimilarities(self, similarities):
        """Store the similarities computed for the document."""
        self._similarities = similarities

    def getSimilarities(self):
        """Return the similarities, or None if they were never set."""
        return self._similarities
- #文档操作工具类
- class DocHelper:
-
# Collect all documents found under a path.
@staticmethod
def genDocs(path):
    """Return a new list filled by ``DocHelper.genDocsIterator(path, ...)``."""
    collected = []
    DocHelper.genDocsIterator(path, collected)
    return collected
-
# Walk a path recursively and append a Doc for every file found.
@staticmethod
def genDocsIterator(path, docs):
    """Append to *docs* one ``Doc`` per file at or below *path*.

    A directory is descended into recursively. Any other path is treated
    as a file: the Doc is named after the file name without its
    extension, its category is the name of the containing directory, and
    its words come from ``WordUtils.splitFile(path)``.
    """
    if os.path.isdir(path):
        for subPathName in os.listdir(path):
            DocHelper.genDocsIterator(os.path.join(path, subPathName), docs)
        return

    # Use os.path helpers rather than splitting on a hard-coded '\\' so the
    # name and category are extracted correctly on every platform, and so a
    # file without an extension keeps its full name.
    name = os.path.splitext(os.path.basename(path))[0]
    doc = Doc(name)
    doc.setCategory(os.path.basename(os.path.dirname(path)))
    doc.setWords(WordUtils.splitFile(path))
    docs.append(doc)
-
# Check whether a document contains the given word.
@staticmethod
def docHasWord(doc, word):
    """Return True if any entry of ``doc.getWords()`` equals *word*."""
    # any() stops at the first match, like the explicit loop it replaces.
    return any(candidate == word for candidate in doc.getWords())
-
- #文档中词频统计
- @staticmethod
- def docWordsStatistics(doc):
- map = {}
- for word in doc.getWords():
- count = map.get(word)
- if count is None:
- count = 0
- map[word] = co
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。