赞
踩
JSON(JavaScript Object Notation) 是一种轻量级的数据交换格式。第一次接触到它是在进行服务器端接口测试的时候。现在很多服务器返回的结果都是json格式。主要是由于它比较容易解析和生成。JSON格式的数据本质上一种被格式化了的字符串。
json.dumps()把一个Python对象编码转换成Json字符串。
dumps操作的是字符串
dump操作的是文件流
import json
# Sample record: tokenized text ("postag"), the raw sentence ("text"),
# and its subject-predicate-object triples ("spo_list").
data = {
    "postag": [
        {"word": "内容", "pos": "n"}, {"word": "简介", "pos": "n"},
        {"word": "《", "pos": "w"}, {"word": "宜兴紫砂图典", "pos": "nw"},
        {"word": "》", "pos": "w"}, {"word": "由", "pos": "p"},
        {"word": "故宫出版社", "pos": "nt"}, {"word": "出版", "pos": "v"},
    ],
    "text": "内容简介《宜兴紫砂图典》由故宫出版社出版",
    "spo_list": [
        {"predicate": "出版社", "object_type": "出版社", "subject_type": "书籍",
         "object": "故宫出版社", "subject": "宜兴紫砂图典"},
    ],
}
# Encode the Python object into a JSON string; with the default
# ensure_ascii=True, non-ASCII characters come out as \uXXXX escapes.
data2 = json.dumps(data)
print(data2)
{"postag": [{"word": "\u5185\u5bb9", "pos": "n"}, {"word": "\u7b80\u4ecb", "pos": "n"},
{"word": "\u300a", "pos": "w"}, {"word": "\u5b9c\u5174\u7d2b\u7802\u56fe\u5178", "pos": "nw"},
{"word": "\u300b", "pos": "w"}, {"word": "\u7531", "pos": "p"},
{"word": "\u6545\u5bab\u51fa\u7248\u793e", "pos": "nt"}, {"word": "\u51fa\u7248", "pos": "v"}],
"text": "\u5185\u5bb9\u7b80\u4ecb\u300a\u5b9c\u5174\u7d2b\u7802\u56fe\u5178\u300b
\u7531\u6545\u5bab\u51fa\u7248\u793e\u51fa\u7248", "spo_list":
[{"predicate": "\u51fa\u7248\u793e", "object_type": "\u51fa\u7248\u793e",
"subject_type": "\u4e66\u7c4d", "object": "\u6545\u5bab\u51fa\u7248\u793e",
"subject": "\u5b9c\u5174\u7d2b\u7802\u56fe\u5178"}]}
sort_keys =True:是告诉编码器按照字典排序(a到z)输出。如果是字典类型的python对象,就把关键字按照字典排序。
indent:参数根据数据格式缩进显示,读起来更加清晰。
separators:是分隔符的意思,参数意思分别为不同dict项之间的分隔符和dict项内key和value之间的分隔符,把:和,后面的空格都除去了。
# Sorted keys (a-z) plus a 2-space indent for readable, pretty-printed
# output; ensure_ascii=False keeps the Chinese text unescaped.
data3 = json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True)
print(data3)
{ "postag": [ { "pos": "n", "word": "内容" }, { "pos": "n", "word": "简介" }, { "pos": "w", "word": "《" }, { "pos": "nw", "word": "宜兴紫砂图典" }, { "pos": "w", "word": "》" }, { "pos": "p", "word": "由" }, { "pos": "nt", "word": "故宫出版社" }, { "pos": "v", "word": "出版" } ], "spo_list": [ { "object": "故宫出版社", "object_type": "出版社", "predicate": "出版社", "subject": "宜兴紫砂图典", "subject_type": "书籍" } ], "text": "内容简介《宜兴紫砂图典》由故宫出版社出版" }
# Compact encoding: separators=(',', ':') strips the spaces that normally
# follow item and key/value separators.
data4 = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
print(data4)
{"postag":[{"word":"内容","pos":"n"},{"word":"简介","pos":"n"},{"word":"《","pos":"w"},{"word":"宜兴紫砂图典","pos":"nw"},{"word":"》","pos":"w"},
{"word":"由","pos":"p"},{"word":"故宫出版社","pos":"nt"},{"word":"出版","pos":"v"}],"text":"内容简介《宜兴紫砂图典》由故宫出版社出版","spo_list":
[{"predicate":"出版社","object_type":"出版社","subject_type":"书籍","object":"故宫出版社","subject":"宜兴紫砂图典"}]}
json.loads()把Json格式字符串解码,转换成Python对象。
loads操作的是字符串
load操作的是文件流
# Decode the JSON string back into the equivalent Python dict and show it.
parsed = json.loads(data3)
print(parsed)
{'postag': [{'pos': 'n', 'word': '内容'}, {'pos': 'n', 'word': '简介'}, {'pos': 'w', 'word': '《'}, {'pos': 'nw', 'word': '宜兴紫砂图典'}, {'pos': 'w', 'word': '》'},
{'pos': 'p', 'word': '由'}, {'pos': 'nt', 'word': '故宫出版社'}, {'pos': 'v', 'word': '出版'}], 'spo_list': [{'object': '故宫出版社', 'object_type': '出版社',
'predicate': '出版社', 'subject': '宜兴紫砂图典', 'subject_type': '书籍'}], 'text': '内容简介《宜兴紫砂图典》由故宫出版社出版'}
# Write JSON data to a file (dump operates on a file stream).
# BUG FIX: the original called json.dump(d1, f) but `d1` was never
# defined (NameError); the object being demonstrated is `data`.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False)
# Read the JSON data back (load operates on a file stream).
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
tqdm中的tqdm()是实现进度条美化的基本方法,在for循环体中用tqdm()包裹指定的迭代器或range()即可,下面是个简单的例子:
import codecs
import json
import os
from collections.abc import Iterable

from tqdm import tqdm
# Collect the set of distinct predicate labels from the schema file,
# wrapping the file iterator in tqdm to show a progress bar.
all_50_schemas = set()
with open(r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\all_50_schemas', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        record = json.loads(line)
        all_50_schemas.add(record['predicate'])
print(all_50_schemas)
50it [00:00, ?it/s]
{'出品公司', '作者', '官方语言', '毕业院校', '注册资本', '目', '导演', '面积', '海拔', '改编自', '人口数量', '作词', '丈夫', '妻子', '出生日期', '祖籍', '气候', '成
立日期', '首都', '专业代码', '嘉宾', '创始人', '出生地', '字', '所属专辑', '邮政编码', '作曲', '母亲', '号', '编剧', '简称', '国籍', '所在城市', '歌手', '制片人',
'连载网站', '修业年限', '董事长', '朝代', '出版社', '占地面积', '民族', '主角', '主持人', '主演', '身高', '父亲', '总部地点', '上映时间'}
path = r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\train_data.json'


def read_json(src_filename):
    """Parse src_filename as a single JSON document and return the object."""
    with open(src_filename, encoding='utf-8') as f:
        return json.load(f)


# NOTE: train_data.json actually holds one JSON object per line (JSON Lines),
# so json.load raises JSONDecodeError("Extra data") — the traceback below.
datastore = read_json(path)
print(datastore[1:3])
--------------------------------------------------------------------------- JSONDecodeError Traceback (most recent call last) <ipython-input-3-340a0261b343> in <module> 5 return json.load(f) 6 ----> 7 datastore = read_json(path) 8 9 print(datastore[1:3]) <ipython-input-3-340a0261b343> in read_json(src_filename) 3 def read_json(src_filename): 4 with open(src_filename,encoding='utf-8' ) as f: ----> 5 return json.load(f) 6 7 datastore = read_json(path) D:\study_Software\Anaconda3\lib\json\__init__.py in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 294 cls=cls, object_hook=object_hook, 295 parse_float=parse_float, parse_int=parse_int, --> 296 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw) 297 298 D:\study_Software\Anaconda3\lib\json\__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 346 parse_int is None and parse_float is None and 347 parse_constant is None and object_pairs_hook is None and not kw): --> 348 return _default_decoder.decode(s) 349 if cls is None: 350 cls = JSONDecoder D:\study_Software\Anaconda3\lib\json\decoder.py in decode(self, s, _w) 338 end = _w(s, end).end() 339 if end != len(s): --> 340 raise JSONDecodeError("Extra data", s, end) 341 return obj 342 JSONDecodeError: Extra data: line 2 column 1 (char 395)
大量数据的文件中往往每行是一个独立的 JSON 对象(多行多列),此时用 json.load 读取就会出现类似上面的报错
raise JSONDecodeError("Extra data", s, end)
Extra data: line 2 column 1 (char 395)
可以逐行读取,然后再处理成列表
def read_jsonline(src_filename, encoding='utf-8', *, default=None):
    """Read a JSON Lines file into a list, one parsed object per line.

    :param src_filename: source file path
    :param encoding: file encoding
    :param default: value to return if the file does not exist;
        set to None to disable the fallback and let open() raise
    :return: object list, one object corresponding to each line
    """
    if default is not None and not os.path.exists(src_filename):
        return default
    items = []
    # BUG FIX: the original opened the file without a with-block, so the
    # handle leaked if json.loads raised on a malformed line.
    with open(src_filename, encoding=encoding) as file:
        for line in file:
            items.append(json.loads(line))
    return items

datastore = read_jsonline(path)
print(datastore[1])
{'postag': [{'word': '《', 'pos': 'w'}, {'word': '中国风水十讲', 'pos': 'nw'}, {'word': '》', 'pos': 'w'}, {'word': '是', 'pos': 'v'}, {'word': '2007年', 'pos': 't'}, {'word': '华夏出版社', 'pos': 'nt'}, {'word': '出版', 'pos': 'v'}, {'word': '的', 'pos': 'u'}, {'word': '图书', 'pos': 'n'}, {'word': ',', 'pos': 'w'}, {'word': '作者', 'pos': 'n'}, {'word': '是', 'pos': 'v'}, {'word': '杨文衡', 'pos': 'nr'}], 'text': '《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡', 'spo_list': [{'predicate': '出版社', 'object_type': '出版社', 'subject_type': '书籍', 'object': '华夏出版社', 'subject': '中国风水十讲'}, {'predicate': '作者', 'object_type': '人物', 'subject_type': '图书作品', 'object': '杨文衡', 'subject': '中国风水十讲'}]}
# Build training records of the form
# {'text': ..., 'spo_list': [(subject, predicate, object), ...]}
# and count character frequencies across all texts.
train_data = []
chars = {}
with open(r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\train_data.json', 'r', encoding='utf-8') as f:
    # Iterate the file object lazily instead of f.readlines(), which
    # materializes the entire (large) file in memory at once.
    for line in f:
        datastore = json.loads(line)
        train_data.append({
            'text': datastore['text'],
            'spo_list': [(spo['subject'], spo['predicate'], spo['object'])
                         for spo in datastore['spo_list']],
        })
        for c in datastore['text']:
            chars[c] = chars.get(c, 0) + 1
print(train_data[1:3])
[{'text': '《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡', 'spo_list': [('中国风水十讲', '出版社', '华夏出版社'), ('中国风水十讲', '作者', '杨文衡')]}, {'text': '《空城未央》是夙言以信创作的网络小说,发表于17K小说网', 'spo_list': [('空城未央', '作者', '夙言以信'), ('空城未央', '连载网站', '17K小说网')]}]
def write_jsonline(dest_filename, items, encoding='utf-8'):
    """Write items to a file in JSON Lines format, one object per line.

    :param dest_filename: destination file path
    :param items: iterable of JSON-serializable objects (must not be a str)
    :param encoding: file encoding
    :return: None
    :raises TypeError: if items is a str or is not iterable
    """
    # str is itself Iterable, so this check must precede the Iterable check.
    if isinstance(items, str):
        raise TypeError("json object list can't be str")
    if not dest_filename.endswith('.jsonl'):
        print("json line filename doesn't end with .jsonl")
    # BUG FIX: the original message read "items can't be iterable" although
    # this branch fires precisely when items is NOT iterable. (`Iterable`
    # also needs importing from collections.abc — it was never imported.)
    if not isinstance(items, Iterable):
        raise TypeError("items must be iterable")
    # with-block guarantees the file is closed even if serialization fails.
    with open(dest_filename, 'w', encoding=encoding) as file:
        for item in items:
            file.write(json.dumps(item, ensure_ascii=False) + '\n')
https://blog.csdn.net/xyz1584172808/article/details/82117220
https://wuwt.me/2017/08/21/pre-trained-embedding-keras/
https://www.eliyar.biz/using-pre-trained-gensim-word2vector-in-a-keras-model-and-visualizing/
https://m.imooc.com/article/295512
https://radimrehurek.com/gensim/models/word2vec.html
https://blog.csdn.net/lilong117194/article/details/82849054
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。