Getting Started with Data Processing: Common Text Preprocessing Operations for NLP

Text Data Preprocessing

I. File Reading and Writing

1 Reading CSV files

import csv
with open(path,"r",encoding="utf-8") as f:
    reader=csv.reader(f) # CSV reader; the default delimiter is ",", use delimiter= to change it
    #reader=csv.reader(f,delimiter="\t")
    birth_header=next(reader) # read the header row
    for row in reader: # iterate over the remaining rows
        ...
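If you prefer to access columns by name instead of index, csv.DictReader is an alternative; a minimal sketch, assuming the file has a header row and hypothetical column names "label" and "text":

import csv
with open(path,"r",encoding="utf-8") as f:
    reader=csv.DictReader(f) # uses the first row as the field names
    for row in reader:
        print(row["label"], row["text"]) # hypothetical column names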

2 Writing CSV files

import csv
f=open("data.csv","w",encoding="utf-8",newline="")
f_writer=csv.writer(f,delimiter=" ") # delimiter sets the field separator
f_writer.writerow(["label","text"]) # write the header row as a list
for line in lines:
    f_writer.writerow(line) # write one row of data
    ...
f.close()

3 Writing JSONL files

import jsonlines
with open("data.jsonl","w",encoding="utf-8") as f:
    pass # create/empty the file first
with jsonlines.open("data.jsonl",mode="a") as f:
    f.write(input_line) # input_line is a dict
    ...

or

import json
with open("data.jsonl","w",encoding="utf-8") as f:
    f.write(json.dumps(input_line)+"\n") # input_line is a dict; add ensure_ascii=False to keep non-ASCII text readable
    ...

4 Reading JSONL files

import jsonlines
with open("data.jsonl","r",encoding="utf-8") as f:
    lines=jsonlines.Reader(f)
    for line in lines: # each line is a dict
        ...

or

import json
with open("data.jsonl","r",encoding="utf-8") as f:
    lines=f.readlines()
    for line in lines:
        line=json.loads(line) # parse the JSON string into a dict
        ...

5 Reading XLSX files

Read all sheets from a single xlsx file:

import pandas as pd

def read_all_sheets(file_path):
    try:
        xls = pd.ExcelFile(file_path)
        sheet_names = xls.sheet_names

        # store each sheet's data in a dict keyed by sheet name
        all_data = {}

        for sheet_name in sheet_names:
            # parse each sheet's data into a DataFrame and store it in the dict
            all_data[sheet_name] = xls.parse(sheet_name)
        
        return all_data

    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")
        return None
    except Exception as e:
        print("An error occurred:", e)
        return None

# specify the xlsx file path and call the function to read the data
file_path = 'your_file.xlsx'  # replace with your own file path
data = read_all_sheets(file_path)

if data:
    # print the data that was read
    for sheet_name, df in data.items():
        print(f"Sheet: {sheet_name}")
        print(df) # each sheet's data is a DataFrame; use df.values.tolist() to convert it to a list
        print("=====================")

6 Writing XLSX files

  • Writing a single sheet
import pandas as pd

# example list
data = [
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'San Francisco'],
    ['Charlie', 35, 'Seattle']
] # the list of rows to write

# convert the list to a DataFrame and set the column names via columns
df = pd.DataFrame(data, columns=['Name', 'Age', 'City'])

# path of the output file
file_path = 'output.xlsx'

# write the DataFrame to the xlsx file
df.to_excel(file_path, index=False)

print(f"Data has been written to {file_path}")
  • Writing multiple sheets
import pandas as pd

# two example lists
data1 = [
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'San Francisco'],
    ['Charlie', 35, 'Seattle']
]

data2 = [
    ['David', 28, 'Los Angeles'],
    ['Eva', 32, 'Chicago'],
    ['Frank', 29, 'Houston']
]

# convert the lists to DataFrames
df1 = pd.DataFrame(data1, columns=['Name', 'Age', 'City'])
df2 = pd.DataFrame(data2, columns=['Name', 'Age', 'City'])

# path of the output file
file_path = 'output.xlsx'

# create an ExcelWriter object
with pd.ExcelWriter(file_path) as writer:
    # write each DataFrame to its own sheet
    df1.to_excel(writer, sheet_name='Sheet1', index=False)
    df2.to_excel(writer, sheet_name='Sheet2', index=False)

print(f"Data has been written to {file_path}")

7 Converting a list-formatted string into a list

import ast
with open("data.txt","r",encoding="utf-8") as f:
    lines=f.readlines()
    for line in lines: # line: '["id","text","label"]' (a string that looks like a list)
        line=ast.literal_eval(line) # line: ["id","text","label"] (an actual list)
        img=line[0]
        ...

II. Data Processing

1 Text processing

New: stopword lists

# Chinese stopwords
["、","。","〈","〉","《","》","一","一个","一些","一何","一切","一则","一方面","一旦","一来","一样","一种","一般","一转眼","七","万一","三","上","上下","下","不","不仅","不但","不光","不单","不只","不外乎","不如","不妨","不尽","不尽然","不得","不怕","不惟","不成","不拘","不料","不是","不比","不然","不特","不独","不管","不至于","不若","不论","不过","不问","与","与其","与其说","与否","与此同时","且","且不说","且说","两者","个","个别","中","临","为","为了","为什么","为何","为止","为此","为着","乃","乃至","乃至于","么","之","之一","之所以","之类","乌乎","乎","乘","九","也","也好","也罢","了","二","二来","于","于是","于是乎","云云","云尔","五","些","亦","人","人们","人家","什","什么","什么样","今","介于","仍","仍旧","从","从此","从而","他","他人","他们","他们们","以","以上","以为","以便","以免","以及","以故","以期","以来","以至","以至于","以致","们","任","任何","任凭","会","似的","但","但凡","但是","何","何以","何况","何处","何时","余外","作为","你","你们","使","使得","例如","依","依据","依照","便于","俺","俺们","倘","倘使","倘或","倘然","倘若","借","借傥然","假使","假如","假若","做","像","儿","先不先","光","光是","全体","全部","八","六","兮","共","关于","关于具体地说","其","其一","其中","其二","其他","其余","其它","其次","具体地说","具体说来","兼之","内","再","再其次","再则","再有","再者","再者说","再说","冒","冲","况且","几","几时","凡","凡是","凭","凭借","出于","出来","分","分别","则","则甚","别","别人","别处","别是","别的","别管","别说","到","前后","前此","前者","加之","加以","区","即","即令","即使","即便","即如","即或","即若","却","去","又","又及","及","及其","及至","反之","反而","反过来","反过来说","受到","另","另一方面","另外","另悉","只","只当","只怕","只是","只有","只消","只要","只限","叫","叮咚","可","可以","可是","可见","各","各个","各位","各种","各自","同","同时","后","后者","向","向使","向着","吓","吗","否则","吧","吧哒","含","吱","呀","呃","呕","呗","呜","呜呼","呢","呵","呵呵","呸","呼哧","咋","和","咚","咦","咧","咱","咱们","咳","哇","哈","哈哈","哉","哎","哎呀","哎哟","哗","哟","哦","哩","哪","哪个","哪些","哪儿","哪天","哪年","哪怕","哪样","哪边","哪里","哼","哼唷","唉","唯有","啊","啐","啥","啦","啪达","啷当","喂","喏","喔唷","喽","嗡","嗡嗡","嗬","嗯","嗳","嘎","嘎登","嘘","嘛","嘻","嘿","嘿嘿","四","因","因为","因了","因此","因着","因而","固然","在","在下","在于","地","基于","处在","多","多么","多少","大","大家","她","她们","好","如","如上","如上所述","如下","如何","如其","如同","如是","如果","如此","如若","始而","孰料","孰知","宁","宁可","宁愿","宁肯","它","它们","对","对于","对待","对方","对比","将","小","尔","尔后","尔尔","尚且","就","就是","就是了","就是说","就算","就要","尽","尽管","尽管如此","岂但","己","已","已矣","巴","巴巴","年","并","并且","庶乎","庶几","开外","开始","归","归齐","当","当地","当然","当着","彼","彼时","彼此","往","待","很","得","得了","怎","怎么","怎么办","怎么样","怎奈","怎样","总之","总的来看","总的来说","总的说来","总而言之","恰恰相反","您","惟其","慢说","我","我们","或","或则","或是","或曰","或者","截至","所","所以","所在","所幸","所有","才","才能","打","打从","把","抑或","拿","按","按照","换句话说","换言之","据","据此","接着","故","故此","故而","旁人","无","无宁","无论","既","既往","既是","既然","日","时","时候","是","是以","是的","更","曾","替","替代","最","月","有","有些","有关","有及","有时","有的","望","朝","朝着","本","本人","本地","本着","本身","来","来着","来自","来说","极了","果然","果真","某","某个","某些","某某","根据","欤","正值","正如","正巧","正是","此","此地","此处","此外","此时","此次","此间","毋宁","每","每当","比","比及","比如","比方","没奈何","沿","沿着","漫说","点","焉","然则","然后","然而","照","照着","犹且","犹自","甚且","甚么","甚或","甚而","甚至","甚至于","用","用来","由","由于","由是","由此","由此可见","的","的确","的话","直到","相对而言","省得","看","眨眼","着","着呢","矣","矣乎","矣哉","离","秒","称","竟而","第","等","等到","等等","简言之","管","类如","紧接着","纵","纵令","纵使","纵然","经","经过","结果","给","继之","继后","继而","综上所述","罢了","者","而","而且","而况","而后","而外","而已","而是","而言","能","能否","腾","自","自个儿","自从","自各儿","自后","自家","自己","自打","自身","至","至于","至今","至若","致","般的","若","若夫","若是","若果","若非","莫不然","莫如","莫若","虽","虽则","虽然","虽说","被","要","要不","要不是","要不然","要么","要是","譬喻","譬如","让","许多","论","设使","设或","设若","诚如","诚然","该","说","说来","请","诸","诸位","诸如","谁","谁人","谁料","谁知","贼死","赖以","赶","起","起见","趁","趁着","越是","距","跟","较","较之","边","过","还","还是","还有","还要","这","这一来","这个","这么","这么些","这么样","这么点儿","这些","这会儿","这儿","这就是说","这时","这样","这次","这般","这边","这里","进而","连","连同","逐步","通过","遵循","遵照","那","那个","那么","那么些","那么样","那些","那会儿","那儿","那时","那样","那般","那边","那里","都","鄙人","鉴于","针对","阿","除","除了","除外","除开","除
此之外","除非","随","随后","随时","随着","难道说","零","非","非但","非徒","非特","非独","靠","顺","顺着","首先","︿","!","#","$","%","&","(",")","*","+",",","0","1","2","3","4","5","6","7","8","9",":",";","<",">","?","@","[","]","{","|","}","~","¥"]
# English stopwords
["'ll","'tis","'twas","'ve","10","39","a","a's","able","ableabout","about","above","abroad","abst","accordance","according","accordingly","across","act","actually","ad","added","adj","adopted","ae","af","affected","affecting","affects","after","afterwards","ag","again","against","ago","ah","ahead","ai","ain't","aint","al","all","allow","allows","almost","alone","along","alongside","already","also","although","always","am","amid","amidst","among","amongst","amoungst","amount","an","and","announce","another","any","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","ao","apart","apparently","appear","appreciate","appropriate","approximately","aq","ar","are","area","areas","aren","aren't","arent","arise","around","arpa","as","aside","ask","asked","asking","asks","associated","at","au","auth","available","aw","away","awfully","az","b","ba","back","backed","backing","backs","backward","backwards","bb","bd","be","became","because","become","becomes","becoming","been","before","beforehand","began","begin","beginning","beginnings","begins","behind","being","beings","believe","below","beside","besides","best","better","between","beyond","bf","bg","bh","bi","big","bill","billion","biol","bj","bm","bn","bo","both","bottom","br","brief","briefly","bs","bt","but","buy","bv","bw","by","bz","c","c'mon","c's","ca","call","came","can","can't","cannot","cant","caption","case","cases","cause","causes","cc","cd","certain","certainly","cf","cg","ch","changes","ci","ck","cl","clear","clearly","click","cm","cmon","cn","co","co.","com","come","comes","computer","con","concerning","consequently","consider","considering","contain","containing","contains","copy","corresponding","could","could've","couldn","couldn't","couldnt","course","cr","cry","cs","cu","currently","cv","cx","cy","cz","d","dare","daren't","darent","date","de","dear","definitely","describe","described","despite","detail","did","didn","didn't","didnt","differ","different","differently","directly","dj","dk","dm","do","does","doesn","doesn't","doesnt","doing","don","don't","done","dont","doubtful","down","downed","downing","downs","downwards","due","during","dz","e","each","early","ec","ed","edu","ee","effect","eg","eh","eight","eighty","either","eleven","else","elsewhere","empty","end","ended","ending","ends","enough","entirely","er","es","especially","et","et-al","etc","even","evenly","ever","evermore","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","face","faces","fact","facts","fairly","far","farther","felt","few","fewer","ff","fi","fifteen","fifth","fifty","fify","fill","find","finds","fire","first","five","fix","fj","fk","fm","fo","followed","following","follows","for","forever","former","formerly","forth","forty","forward","found","four","fr","free","from","front","full","fully","further","furthered","furthering","furthermore","furthers","fx","g","ga","gave","gb","gd","ge","general","generally","get","gets","getting","gf","gg","gh","gi","give","given","gives","giving","gl","gm","gmt","gn","go","goes","going","gone","good","goods","got","gotten","gov","gp","gq","gr","great","greater","greatest","greetings","group","grouped","grouping","groups","gs","gt","gu","gw","gy","h","had","hadn't","hadnt","half","happens","hardly","has","hasn","hasn't","hasnt","have","haven","haven't","havent","having","he","he'd","he'll","he's","hed","hell","hello","help","hence","her","here","here's","hereafter","hereby","herein","heres","hereupon","hers","herself","herse”","hes","hi","hid","high","
higher","highest","him","himself","himse”","his","hither","hk","hm","hn","home","homepage","hopefully","how","how'd","how'll","how's","howbeit","however","hr","ht","htm","html","http","hu","hundred","i","i'd","i'll","i'm","i've","i.e.","id","ie","if","ignored","ii","il","ill","im","immediate","immediately","importance","important","in","inasmuch","inc","inc.","indeed","index","indicate","indicated","indicates","information","inner","inside","insofar","instead","int","interest","interested","interesting","interests","into","invention","inward","io","iq","ir","is","isn","isn't","isnt","it","it'd","it'll","it's","itd","itll","its","itself","itse”","ive","j","je","jm","jo","join","jp","just","k","ke","keep","keeps","kept","keys","kg","kh","ki","kind","km","kn","knew","know","known","knows","kp","kr","kw","ky","kz","l","la","large","largely","last","lately","later","latest","latter","latterly","lb","lc","least","length","less","lest","let","let's","lets","li","like","liked","likely","likewise","line","little","lk","ll","long","longer","longest","look","looking","looks","low","lower","lr","ls","lt","ltd","lu","lv","ly","m","ma","made","mainly","make","makes","making","man","many","may","maybe","mayn't","maynt","mc","md","me","mean","means","meantime","meanwhile","member","members","men","merely","mg","mh","microsoft","might","might've","mightn't","mightnt","mil","mill","million","mine","minus","miss","mk","ml","mm","mn","mo","more","moreover","most","mostly","move","mp","mq","mr","mrs","ms","msie","mt","mu","much","mug","must","must've","mustn't","mustnt","mv","mw","mx","my","myself","myse”","mz","n","na","name","namely","nay","nc","nd","ne","near","nearly","necessarily","necessary","need","needed","needing","needn't","neednt","needs","neither","net","netscape","never","neverf","neverless","nevertheless","new","newer","newest","next","nf","ng","ni","nine","ninety","nl","no","no-one","nobody","non","none","nonetheless","noone","nor","normally","nos","not","noted","nothing","notwithstanding","novel","now","nowhere","np","nr","nu","null","number","numbers","nz","o","obtain","obtained","obviously","of","off","often","oh","ok","okay","old","older","oldest","om","omitted","on","once","one","one's","ones","only","onto","open","opened","opening","opens","opposite","or","ord","order","ordered","ordering","orders","org","other","others","otherwise","ought","oughtn't","oughtnt","our","ours","ourselves","out","outside","over","overall","owing","own","p","pa","page","pages","part","parted","particular","particularly","parting","parts","past","pe","per","perhaps","pf","pg","ph","pk","pl","place","placed","places","please","plus","pm","pmid","pn","point","pointed","pointing","points","poorly","possible","possibly","potentially","pp","pr","predominantly","present","presented","presenting","presents","presumably","previously","primarily","probably","problem","problems","promptly","proud","provided","provides","pt","put","puts","pw","py","q","qa","que","quickly","quite","qv","r","ran","rather","rd","re","readily","really","reasonably","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","reserved","respectively","resulted","resulting","results","right","ring","ro","room","rooms","round","ru","run","rw","s","sa","said","same","saw","say","saying","says","sb","sc","sd","se","sec","second","secondly","seconds","section","see","seeing","seem","seemed","seeming","seems","seen","sees","self","selves","sensible","sent","serious","seriously","seven","seventy","several","sg"
,"sh","shall","shan't","shant","she","she'd","she'll","she's","shed","shell","shes","should","should've","shouldn","shouldn't","shouldnt","show","showed","showing","shown","showns","shows","si","side","sides","significant","significantly","similar","similarly","since","sincere","site","six","sixty","sj","sk","sl","slightly","sm","small","smaller","smallest","sn","so","some","somebody","someday","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","sr","st","state","states","still","stop","strongly","su","sub","substantially","successfully","such","sufficiently","suggest","sup","sure","sv","sy","system","sz","t","t's","take","taken","taking","tc","td","tell","ten","tends","test","text","tf","tg","th","than","thank","thanks","thanx","that","that'll","that's","that've","thatll","thats","thatve","the","their","theirs","them","themselves","then","thence","there","there'd","there'll","there're","there's","there've","thereafter","thereby","thered","therefore","therein","therell","thereof","therere","theres","thereto","thereupon","thereve","these","they","they'd","they'll","they're","they've","theyd","theyll","theyre","theyve","thick","thin","thing","things","think","thinks","third","thirty","this","thorough","thoroughly","those","thou","though","thoughh","thought","thoughts","thousand","three","throug","through","throughout","thru","thus","til","till","tip","tis","tj","tk","tm","tn","to","today","together","too","took","top","toward","towards","tp","tr","tried","tries","trillion","truly","try","trying","ts","tt","turn","turned","turning","turns","tv","tw","twas","twelve","twenty","twice","two","tz","u","ua","ug","uk","um","un","under","underneath","undoing","unfortunately","unless","unlike","unlikely","until","unto","up","upon","ups","upwards","us","use","used","useful","usefully","usefulness","uses","using","usually","uucp","uy","uz","v","va","value","various","vc","ve","versus","very","vg","vi","via","viz","vn","vol","vols","vs","vu","w","want","wanted","wanting","wants","was","wasn","wasn't","wasnt","way","ways","we","we'd","we'll","we're","we've","web","webpage","website","wed","welcome","well","wells","went","were","weren","weren't","werent","weve","wf","what","what'd","what'll","what's","what've","whatever","whatll","whats","whatve","when","when'd","when'll","when's","whence","whenever","where","where'd","where'll","where's","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","which","whichever","while","whilst","whim","whither","who","who'd","who'll","who's","whod","whoever","whole","wholl","whom","whomever","whos","whose","why","why'd","why'll","why's","widely","width","will","willing","wish","with","within","without","won","won't","wonder","wont","words","work","worked","working","works","world","would","would've","wouldn","wouldn't","wouldnt","ws","www","x","y","ye","year","years","yes","yet","you","you'd","you'll","you're","you've","youd","youll","young","younger","youngest","your","youre","yours","yourself","yourselves","youve","yt","yu","z","za","zero","zm","zr"]
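The lists above can be plugged straight into a token filter; a minimal sketch, assuming the text has already been tokenized into a list of tokens (the names stopwords and tokens are hypothetical):

stopwords=set(["的","了","一个","这","个"]) # replace with the full Chinese or English list above; a set makes lookups fast
tokens=["我","喜欢","这","个","方法"] # hypothetical tokenized sentence
filtered=[tok for tok in tokens if tok not in stopwords] # drop the stopwords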

New: a preprocessing method for tweets (source code: https://github.com/VinAIResearch/BERTweet)

from emoji import demojize
from nltk.tokenize import TweetTokenizer


tokenizer = TweetTokenizer()


def normalizeToken(token):
    lowercased_token = token.lower()
    if token.startswith("@"):
        return "@USER"
    elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
        return "HTTPURL"
    elif len(token) == 1:
        return demojize(token)
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token


def normalizeTweet(tweet):
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    normTweet = (
        normTweet.replace("cannot ", "can not ")
        .replace("n't ", " n't ")
        .replace("n 't ", " n't ")
        .replace("ca n't", "can't")
        .replace("ai n't", "ain't")
    )
    normTweet = (
        normTweet.replace("'m ", " 'm ")
        .replace("'re ", " 're ")
        .replace("'s ", " 's ")
        .replace("'ll ", " 'll ")
        .replace("'d ", " 'd ")
        .replace("'ve ", " 've ")
    )
    normTweet = (
        normTweet.replace(" p . m .", "  p.m.")
        .replace(" p . m ", " p.m ")
        .replace(" a . m .", " a.m.")
        .replace(" a . m ", " a.m ")
    )

    return " ".join(normTweet.split())


# if __name__ == "__main__":
#     print(
#         normalizeTweet(
#             "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
#         )
#     )

Check whether a string starts or ends with a fixed substring

#a single prefix or suffix
if string.startswith("http"):
    ...
if string.endswith(".jpg"):
    ...
#several possible prefixes or suffixes
if word.startswith(("#",'"',"@","“","\\","'")): # wrap the candidates in a tuple
    ...
if word.endswith(("!",".",":",'"',"?",",","\\","”","'")):
    ...

Match all English characters (regular expression)

import re
my_re=re.compile(r'[A-Za-z]',re.S)
lst=re.findall(my_re,word) # returns a list of single matched letters; use r'[A-Za-z]+' to match whole runs of letters

Match all Chinese characters (regular expression)

import re
my_re=re.compile(u'[\u4e00-\u9fa5]')
lst=re.findall(my_re,word) # returns a list of single matched characters; use u'[\u4e00-\u9fa5]+' to match whole runs

Remove emoji

import re
try:
    co=re.compile(u'[\U00010000-\U0010ffff]') # wide (UCS-4) Python builds
except re.error:
    co=re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') # narrow (UCS-2) builds: match surrogate pairs

token=co.sub("",token) # token is a single token (word or emoji symbol)
# after the substitution, an empty token means the original token was an emoji
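The emoji package (already used above for demojize) can also strip emoji directly; a minimal sketch, assuming emoji >= 2.0, which provides replace_emoji:

import emoji
text="hello 😀 world" # hypothetical input
text=emoji.replace_emoji(text, replace="") # remove every emoji from the string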

Remove punctuation from English text

import re
import string

def clean_text(s):
    s = s.split()
    s = " ".join(s) # collapse runs of whitespace into single spaces
    s = re.sub(f'[{re.escape(string.punctuation)}]', '', s) # strip ASCII punctuation
    return s

Generate word vectors from a GloVe file

import csv
import pandas as pd
glove_model=pd.read_table("glove.twitter.27B.100d.txt",sep=" ",index_col=0, header=None, quoting=csv.QUOTE_NONE) # load the GloVe file (csv is imported for QUOTE_NONE)

word=glove_model.loc[word].values # look up the word vector
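Looking up a word that is not in the GloVe vocabulary raises a KeyError; a minimal sketch of guarding against that (the zero-vector fallback is an assumption, not part of the original snippet):

import numpy as np

def get_vector(glove_model, word, dim=100):
    # return the GloVe vector if the word is in the vocabulary, otherwise a zero vector
    if word in glove_model.index:
        return glove_model.loc[word].values
    return np.zeros(dim)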

Pad all texts to the same length

import numpy as np

def fill_sentence(embeddings, embedding_dim):
    """
    Input:
    embeddings: [sentence.array, sentence.array, ...]
    embedding_dim: must match the dimension used in embeddings
    """
    fill_embeddings = []

    length = [len(embedding) for embedding in embeddings] # one embedding per sentence
    max_len = max(length)

    for embedding in embeddings:
        if len(embedding) < max_len:
            fill_zero = np.zeros((max_len - len(embedding), embedding_dim))
            fill_embedding = np.append(embedding, fill_zero)
            fill_embedding = fill_embedding.reshape(-1, embedding_dim) # -1: let reshape infer this dimension from the other one
            fill_embeddings.append(fill_embedding)
        else:
            fill_embeddings.append(embedding)
    return np.array(fill_embeddings)
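A short usage sketch with two toy sentences of different lengths (embedding_dim=3 is arbitrary here):

import numpy as np

sent1 = np.random.rand(4, 3) # a sentence of 4 tokens with 3-dimensional embeddings
sent2 = np.random.rand(2, 3) # a shorter sentence of 2 tokens
padded = fill_sentence([sent1, sent2], embedding_dim=3)
print(padded.shape) # (2, 4, 3): both sentences are padded to the longest length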

2 Directory handling

import os
for root,dirs,files in os.walk(filename): # filename is the directory to walk
    # root: path of the directory currently being visited
    # dirs: list of sub-directory names under that directory
    # files: list of file names under that directory
    for file in files: # iterate over every file in the directory
        new_filepath=os.path.join(root,file) # build the full path
        ...

3 Shuffling a dataset

import random

with open("data.jsonl","r",encoding="utf-8") as f:
    lines=f.readlines()

index=[i for i in range(len(lines))] # build a list of indices

random.shuffle(index) # shuffle the indices

with open("train.jsonl","w",encoding="utf-8") as f:
    for i in range(0,3251): # 3251 is the number of samples you want in this split
        f.write(lines[index[i]]) # write the lines in shuffled order
...
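A possible continuation that writes the remaining shuffled lines to a validation file (the file name dev.jsonl and the 3251 cut-off are assumptions carried over from the snippet above):

with open("dev.jsonl","w",encoding="utf-8") as f:
    for i in range(3251, len(lines)):
        f.write(lines[index[i]]) # the rest of the shuffled lines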

Or:

import numpy as np
np.random.seed(1337) # fix the seed for reproducibility
np.random.shuffle(trains) # trains is the list/array of samples, shuffled in place
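When texts and labels live in separate lists, shuffling them with one shared permutation keeps them aligned; a minimal sketch (texts and labels are hypothetical names):

import numpy as np

texts = ["a", "b", "c"] # hypothetical samples
labels = [0, 1, 0] # hypothetical labels
perm = np.random.permutation(len(texts)) # one shared permutation
texts = [texts[i] for i in perm]
labels = [labels[i] for i in perm]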

III. Miscellaneous

1 Using filters (filter)

Filter unwanted tokens out of a sentence:

def word_filter(token): # the filter function: return True to keep the token, False to drop it
    if token.startswith("http"): # drop links
        return False
    else:
        return True

sentence=[word for word in filter(word_filter,sentence)] # sentence is a list of tokens

2 Creating directories

import os
book="a/"
if not os.path.exists(book): # check whether the directory already exists
    os.mkdir(book) # create a single directory

book="a/b/"
if not os.path.exists(book):
    os.makedirs(book) # create nested directories

3 Moving files (move data from one directory to another)

import os
import shutil
path1="row_path" # source directory
path2="path" # destination directory

for img in img_list: # img_list holds the names of the files to move
    path=os.path.join(path1,img)
    despath=os.path.join(path2,img)
    shutil.move(path,despath) # move the file from path to despath
    # shutil.copy(path,despath) # use copy instead of move to keep the original file