赞
踩
import pandas as pd

# Read the GBK-encoded novel text into a one-column DataFrame whose single
# column is named 'txt'.
raw = pd.read_table('../data/金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding="GBK")
print(len(raw))
# Bare expression: presumably kept so a notebook cell displays the frame.
raw
# Helper used to pre-compute a column for chapter detection.
def m_head(tmpstr: str) -> str:
    """Return the first character of ``tmpstr``, or ``""`` if it is empty."""
    return tmpstr[:1]

def m_mid(tmpstr: str) -> int:
    """Return the index of the first ``"回 "`` in ``tmpstr``, or -1 if absent."""
    return tmpstr.find("回 ")

# Per-line helper columns read by the chapter-detection loop.
raw['head'] = raw.txt.apply(m_head)  # first character of the line
raw['mid'] = raw.txt.apply(m_mid)  # index of "回 " in the line, -1 if absent
raw['len'] = raw.txt.apply(len)  # length of the line
raw.head(50)
# Chapter detection: label every row with a running chapter number.
# The labels are collected in one pass and assigned as a whole column, which
# avoids per-row chained indexing / .loc writes and keeps 'chap' integer-typed.
chapnum = 0
chap_ids = []
for head, mid, length, txt in zip(raw['head'], raw['mid'], raw['len'], raw['txt']):
    # A row counts as a chapter heading when it starts with "第", contains
    # "回 " after the first character, and is shorter than 30 characters.
    if head == "第" and mid > 0 and length < 30:
        chapnum += 1
    # Once at least 40 headings have been counted, this exact appendix title
    # resets the counter to 0, so it and the rows after it are labelled 0.
    if chapnum >= 40 and txt == "附录一:成吉思汗家族":
        chapnum = 0
    chap_ids.append(chapnum)
raw['chap'] = chap_ids

raw.head(50)
# Delete the temporary helper columns.
raw.drop(columns=['head', 'mid', 'len'], inplace=True)
raw.head(50)
# Merge the rows of each chapter into one row per 'chap' value.
rawgrp = raw.groupby('chap')
# With string-only columns, the "sum" aggregation concatenates the strings.
# The string name is used instead of the builtin ``sum``, whose use as an
# aggregation callable is deprecated in recent pandas releases.
chapter = rawgrp.agg("sum")
# Discard the group labelled 0.
chapter = chapter[chapter.index != 0]
chapter
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。