赞
踩
- txt1='''Hi! This is Wang.
- Hello! It's Sun speaking.
- Do you have free time this evening?
- Uh... Let's me see. What's the matter?
- I hope you can see a new movie Super Hero with me. I've been wanting to see it for a long time.
- Sorry, I didn't catch that. Could you say the name again?
- Super Hero.
- Oh! Super Hero. I'm very interested in it. I haven't seen it. I can go with you. When the movie start?
- It begin at seven o'clock. Let's gather at the cinema at ten to seven.
- Ok, I'll arrive there on time. Goodbye!
- Bye.'''
- txt2="Hi! Wang. I've arrive the cinema. Where are you?"
-
- #自定义文本预处理函数
- def txtpre():
- global txt#txt为全局变量
- txt=txt.lower()
- #变特殊字符为空格
- for ch in "!,.?":
- txt=txt.replace(ch," ")
- #对于缩写,通过空格按要求分隔
- txt=txt.replace("it's","it 's")
- txt=txt.replace("i've","i 've")
- txt=txt.replace("don't","do n't")
- txt=txt.replace("i'll","i 'll")
- txt=txt.replace("i'd","i 'd")
-
- #将txt1赋给txt,作预处理
- txt=txt1
- txtpre()
- list1=list(txt.split())#将字符串按空格分隔转换成列表
- dict1=dict.fromkeys(list1,0)#创建一个新字典,默认键对应的值为0
-
- #记录词频到字典的值当中,避免重复词(键)
- for x in list1:
- dict1[x]+=1
-
- #将词(键)组合成列表,并添加"UNKNOWN"为列表的最后一个元素
- key_lst=[]
- for k in range(len(dict1)):
- key_lst=list(dict1.keys())
- key_lst.append("UNKNOWN")
-
- char_to_int = dict((c, i) for i, c in enumerate(key_lst))#词转化为编号需要用到的数据类型
- int_to_char = dict((i, c) for i, c in enumerate(key_lst))#编号转化为词需要用到的数据类型
- print(int_to_char)#以“编号 词”的方式输出词典
-
- #将txt2赋给txt,作预处理
- txt=txt2
- txtpre()
- list2=list(txt.split())
-
- integer_encoded = []#编号组合成的整数矩阵
- #依次检索表2元素,如果在key_lst中,加编号到整数矩阵中,否则加"UNKONWN"的编号到矩阵中
- for char in list2:
- if (char in key_lst):
- integer_encoded.append(char_to_int[char])
- else:
- integer_encoded.append(key_lst.index("UNKNOWN"))
-
- #构成onehot形式并输出,编码的过程
- onehot_encoded =[]
- for value in integer_encoded:
- letter = [0 for _ in range(len(key_lst))]
- letter[value] = 1
- onehot_encoded.append(letter)
- print(onehot_encoded)
-
- #解码,并输出由词组合成的列表
- list_decode=[]
- for i in range(len(onehot_encoded)):
- decode = int_to_char[integer_encoded[onehot_encoded.index(onehot_encoded[i])]]#逐步往回推
- list_decode.append(decode)
- print(list_decode)
运行结果:
- {0: 'hi', 1: 'this', 2: 'is', 3: 'wang', 4: 'hello', 5: 'it', 6: "'s", 7: 'sun', 8: 'speaking', 9: 'do', 10: 'you', 11: 'have', 12: 'free', 13: 'time', 14: 'evening', 15: 'uh', 16: "let's", 17: 'me', 18: 'see', 19: "what's", 20: 'the', 21: 'matter', 22: 'i', 23: 'hope', 24: 'can', 25: 'a', 26: 'new', 27: 'movie', 28: 'super', 29: 'hero', 30: 'with', 31: "'ve", 32: 'been', 33: 'wanting', 34: 'to', 35: 'for', 36: 'long', 37: 'sorry', 38: "didn't", 39: 'catch', 40: 'that', 41: 'could', 42: 'say', 43: 'name', 44: 'again', 45: 'oh', 46: "i'm", 47: 'very', 48: 'interested', 49: 'in', 50: "haven't", 51: 'seen', 52: 'go', 53: 'when', 54: 'start', 55: 'begin', 56: 'at', 57: 'seven', 58: "o'clock", 59: 'gather', 60: 'cinema', 61: 'ten', 62: 'ok', 63: "'ll", 64: 'arrive', 65: 'there', 66: 'on', 67: 'goodbye', 68: 'bye', 69: 'UNKNOWN'}
- [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
- ['hi', 'wang', 'i', "'ve", 'arrive', 'the', 'cinema', 'UNKNOWN', 'UNKNOWN', 'you']
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。