当前位置:   article > 正文

json格式的文本处理_postag spo_list object_type 使用什么工具

postag spo_list object_type 使用什么工具

JSON(JavaScript Object Notation) 是一种轻量级的数据交换格式。第一次接触到它是在进行服务器端接口测试的时候。现在很多服务器返回的结果都是json格式。主要是由于它比较容易解析和生成。JSON格式的数据本质上一种被格式化了的字符串。

编码

json.dumps()把一个Python对象编码,转换成JSON字符串。

  • dumps操作的是字符串

  • dump操作的是文件流

import json

# A sample record: segmented text (postag), the raw sentence (text), and
# subject-predicate-object triples (spo_list).
data = {
    "postag": [
        {"word": "内容", "pos": "n"}, {"word": "简介", "pos": "n"},
        {"word": "《", "pos": "w"}, {"word": "宜兴紫砂图典", "pos": "nw"},
        {"word": "》", "pos": "w"}, {"word": "由", "pos": "p"},
        {"word": "故宫出版社", "pos": "nt"}, {"word": "出版", "pos": "v"},
    ],
    "text": "内容简介《宜兴紫砂图典》由故宫出版社出版",
    "spo_list": [
        {
            "predicate": "出版社", "object_type": "出版社",
            "subject_type": "书籍", "object": "故宫出版社",
            "subject": "宜兴紫砂图典",
        }
    ],
}

# Default dumps(): non-ASCII characters are escaped as \uXXXX sequences.
data2 = json.dumps(data)
print(data2)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
{"postag": [{"word": "\u5185\u5bb9", "pos": "n"}, {"word": "\u7b80\u4ecb", "pos": "n"}, 
{"word": "\u300a", "pos": "w"}, {"word": "\u5b9c\u5174\u7d2b\u7802\u56fe\u5178", "pos": "nw"}, 
{"word": "\u300b", "pos": "w"}, {"word": "\u7531", "pos": "p"},
{"word": "\u6545\u5bab\u51fa\u7248\u793e", "pos": "nt"}, {"word": "\u51fa\u7248", "pos": "v"}], 
"text": "\u5185\u5bb9\u7b80\u4ecb\u300a\u5b9c\u5174\u7d2b\u7802\u56fe\u5178\u300b
\u7531\u6545\u5bab\u51fa\u7248\u793e\u51fa\u7248", "spo_list": 
[{"predicate": "\u51fa\u7248\u793e", "object_type": "\u51fa\u7248\u793e", 
"subject_type": "\u4e66\u7c4d", "object": "\u6545\u5bab\u51fa\u7248\u793e",
"subject": "\u5b9c\u5174\u7d2b\u7802\u56fe\u5178"}]}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9

sort_keys=True:告诉编码器按照字典顺序(a到z)输出。如果是字典类型的Python对象,就把关键字按照字典顺序排序。

indent:参数根据数据格式缩进显示,读起来更加清晰。

separators:是分隔符的意思,参数意思分别为不同dict项之间的分隔符和dict项内key和value之间的分隔符,把:和,后面的空格都除去了。


# Sorted keys (a→z), two-space indentation, and CJK characters kept unescaped.
data3 = json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True)
print(data3)
  • 1
  • 2
  • 3
  • 4
  • 5
{
  "postag": [
    {
      "pos": "n",
      "word": "内容"
    },
    {
      "pos": "n",
      "word": "简介"
    },
    {
      "pos": "w",
      "word": "《"
    },
    {
      "pos": "nw",
      "word": "宜兴紫砂图典"
    },
    {
      "pos": "w",
      "word": "》"
    },
    {
      "pos": "p",
      "word": "由"
    },
    {
      "pos": "nt",
      "word": "故宫出版社"
    },
    {
      "pos": "v",
      "word": "出版"
    }
  ],
  "spo_list": [
    {
      "object": "故宫出版社",
      "object_type": "出版社",
      "predicate": "出版社",
      "subject": "宜兴紫砂图典",
      "subject_type": "书籍"
    }
  ],
  "text": "内容简介《宜兴紫砂图典》由故宫出版社出版"
}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
# Compact form: separators (',', ':') drop the spaces after item and key separators.
data4 = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
print(data4)
  • 1
  • 2
{"postag":[{"word":"内容","pos":"n"},{"word":"简介","pos":"n"},{"word":"《","pos":"w"},{"word":"宜兴紫砂图典","pos":"nw"},{"word":"》","pos":"w"},
{"word":"由","pos":"p"},{"word":"故宫出版社","pos":"nt"},{"word":"出版","pos":"v"}],"text":"内容简介《宜兴紫砂图典》由故宫出版社出版","spo_list":
[{"predicate":"出版社","object_type":"出版社","subject_type":"书籍","object":"故宫出版社","subject":"宜兴紫砂图典"}]}
  • 1
  • 2
  • 3

解码

json.loads()把Json格式字符串解码,转换成Python对象。

  • loads操作的是字符串

  • load操作的是文件流

# loads() decodes the JSON string back into the equivalent Python dict.
decoded = json.loads(data3)
print(decoded)
  • 1
{'postag': [{'pos': 'n', 'word': '内容'}, {'pos': 'n', 'word': '简介'}, {'pos': 'w', 'word': '《'}, {'pos': 'nw', 'word': '宜兴紫砂图典'}, {'pos': 'w', 'word': '》'}, 
{'pos': 'p', 'word': '由'}, {'pos': 'nt', 'word': '故宫出版社'}, {'pos': 'v', 'word': '出版'}], 'spo_list': [{'object': '故宫出版社', 'object_type': '出版社', 
'predicate': '出版社', 'subject': '宜兴紫砂图典', 'subject_type': '书籍'}], 'text': '内容简介《宜兴紫砂图典》由故宫出版社出版'}
  • 1
  • 2
  • 3

读与写

# Write the object out as a JSON file.
with open('data.json', 'w') as f:
    json.dump(data, f)  # BUG FIX: original passed the undefined name `d1` (NameError)
# Read the JSON data back into a Python object.
with open('data.json', 'r') as f:
    data = json.load(f)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6

tqdm

tqdm中的tqdm()是实现进度条美化的基本方法,在for循环体中用tqdm()包裹指定的迭代器或range()即可,下面是个简单的例子:

import json
from tqdm import tqdm
import codecs
import os

# Collect the distinct predicate labels from the schema file,
# showing a tqdm progress bar while iterating the lines.
all_50_schemas = set()

with open(r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\all_50_schemas', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        record = json.loads(line)
        all_50_schemas.add(record['predicate'])

print(all_50_schemas)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
50it [00:00, ?it/s]
{'出品公司', '作者', '官方语言', '毕业院校', '注册资本', '目', '导演', '面积', '海拔', '改编自', '人口数量', '作词', '丈夫', '妻子', '出生日期', '祖籍', '气候', '成
立日期', '首都', '专业代码', '嘉宾', '创始人', '出生地', '字', '所属专辑', '邮政编码', '作曲', '母亲', '号', '编剧', '简称', '国籍', '所在城市', '歌手', '制片人', 
'连载网站', '修业年限', '董事长', '朝代', '出版社', '占地面积', '民族', '主角', '主持人', '主演', '身高', '父亲', '总部地点', '上映时间'}
  • 1
  • 2
  • 3
  • 4

读函数

path = r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\train_data.json'


def read_json(src_filename):
    """Load the whole file as a single JSON document and return the parsed object."""
    with open(src_filename, encoding='utf-8') as f:
        return json.load(f)


datastore = read_json(path)

print(datastore[1:3])
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
---------------------------------------------------------------------------

JSONDecodeError                           Traceback (most recent call last)

<ipython-input-3-340a0261b343> in <module>
      5         return json.load(f)
      6 
----> 7 datastore =  read_json(path)
      8 
      9 print(datastore[1:3])


<ipython-input-3-340a0261b343> in read_json(src_filename)
      3 def read_json(src_filename):
      4     with open(src_filename,encoding='utf-8' ) as f:
----> 5         return json.load(f)
      6 
      7 datastore =  read_json(path)


D:\study_Software\Anaconda3\lib\json\__init__.py in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    294         cls=cls, object_hook=object_hook,
    295         parse_float=parse_float, parse_int=parse_int,
--> 296         parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
    297 
    298 


D:\study_Software\Anaconda3\lib\json\__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    346             parse_int is None and parse_float is None and
    347             parse_constant is None and object_pairs_hook is None and not kw):
--> 348         return _default_decoder.decode(s)
    349     if cls is None:
    350         cls = JSONDecoder


D:\study_Software\Anaconda3\lib\json\decoder.py in decode(self, s, _w)
    338         end = _w(s, end).end()
    339         if end != len(s):
--> 340             raise JSONDecodeError("Extra data", s, end)
    341         return obj
    342 


JSONDecodeError: Extra data: line 2 column 1 (char 395)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45

读取的大量数据中包含多行记录时,会出现类似标题的报错:

raise JSONDecodeError("Extra data", s, end)

Extra data: line 2 column 1 (char 395)

可以逐行读取,然后再处理成列表


def read_jsonline(src_filename, encoding='utf-8', *, default=None):
    """Read a JSON Lines file: one JSON object per line.

    :param src_filename: source file path
    :param encoding: file encoding
    :param default: value returned when the file does not exist.
        Set it to None to disable the fallback (a missing file then raises).
    :return: list of objects, one object per line
    :raises json.JSONDecodeError: if any line is not valid JSON
    """
    if default is not None and not os.path.exists(src_filename):
        return default
    # Context manager guarantees the file is closed even if a line fails to
    # parse (the original leaked the open handle on error).
    with open(src_filename, encoding=encoding) as file:
        return [json.loads(line) for line in file]


# Re-read the training file line by line and peek at the second record.
records = read_jsonline(path)
print(records[1])
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
{'postag': [{'word': '《', 'pos': 'w'}, {'word': '中国风水十讲', 'pos': 'nw'}, {'word': '》', 'pos': 'w'}, {'word': '是', 'pos': 'v'}, {'word': '2007年', 'pos': 't'}, {'word': '华夏出版社', 'pos': 'nt'}, {'word': '出版', 'pos': 'v'}, {'word': '的', 'pos': 'u'}, {'word': '图书', 'pos': 'n'}, {'word': ',', 'pos': 'w'}, {'word': '作者', 'pos': 'n'}, {'word': '是', 'pos': 'v'}, {'word': '杨文衡', 'pos': 'nr'}], 'text': '《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡', 'spo_list': [{'predicate': '出版社', 'object_type': '出版社', 'subject_type': '书籍', 'object': '华夏出版社', 'subject': '中国风水十讲'}, {'predicate': '作者', 'object_type': '人物', 'subject_type': '图书作品', 'object': '杨文衡', 'subject': '中国风水十讲'}]}
  • 1
# Build (subject, predicate, object) triples per sentence, and count the
# frequency of every character appearing in the training texts.
train_data = []
chars = {}
with open(r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\train_data.json', 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        triples = [(spo['subject'], spo['predicate'], spo['object'])
                   for spo in record['spo_list']]
        train_data.append({'text': record['text'], 'spo_list': triples})
        for ch in record['text']:
            chars[ch] = chars.get(ch, 0) + 1

print(train_data[1:3])
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
[{'text': '《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡', 'spo_list': [('中国风水十讲', '出版社', '华夏出版社'), ('中国风水十讲', '作者', '杨文衡')]}, {'text': '《空城未央》是夙言以信创作的网络小说,发表于17K小说网', 'spo_list': [('空城未央', '作者', '夙言以信'), ('空城未央', '连载网站', '17K小说网')]}]
  • 1

写函数

def write_jsonline(dest_filename, items, encoding='utf-8'):
    """Write items to a file in JSON Lines format (one JSON object per line).

    :param dest_filename: destination file path
    :param items: iterable of JSON-serializable objects, saved line by line
    :param encoding: file encoding
    :raises TypeError: if items is a str, or not iterable at all
    """
    # Local import keeps the snippet self-contained (the article never
    # imported Iterable, so the original raised NameError on this check).
    from collections.abc import Iterable

    if isinstance(items, str):
        raise TypeError('json object list can\'t be str')

    if not dest_filename.endswith('.jsonl'):
        print('json line filename doesn\'t end with .jsonl')

    if not isinstance(items, Iterable):
        # BUG FIX: original message said "items can't be iterable", which is
        # the opposite of the condition being checked.
        raise TypeError('items must be iterable')

    # Context manager closes the file even if serialization raises
    # (the original leaked the open handle on error).
    with open(dest_filename, 'w', encoding=encoding) as file:
        for item in items:
            file.write(json.dumps(item, ensure_ascii=False) + '\n')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20

参考

https://blog.csdn.net/xyz1584172808/article/details/82117220

https://wuwt.me/2017/08/21/pre-trained-embedding-keras/

https://www.eliyar.biz/using-pre-trained-gensim-word2vector-in-a-keras-model-and-visualizing/

https://m.imooc.com/article/295512

https://radimrehurek.com/gensim/models/word2vec.html

https://blog.csdn.net/lilong117194/article/details/82849054

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/繁依Fanyi0/article/detail/351648?site
推荐阅读
相关标签
  

闽ICP备14008679号