当前位置:   article > 正文

用python读取多种格式文件_pyteomics

pyteomics

Json:

use_time=[]
with open(address,'r') as f: #ubuntu
    mobile = json.load(f)
    calls = mobile["transactions"][0]["calls"]
for call in calls: 
  use_time.append(str(call['use_time']))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6

Excel:

rawdata1=open_workbook(address)
rawdata=rawdata1.sheet_by_index(0)
for i in range(1,rawdata.nrows):
    if rawdata.cell(i,date_index).value=="": #跳过空行
        continue
    else:
        if ctype==3:  #若为3,则用datetime模块处理日期
            date1=rawdata.cell(i,date_index).value
            date2 = xldate_as_tuple(date1,0) 
            date3=datetime(*date2)
            if "." in str(rawdata.cell(i,phone_index).value):
                phone1=str(rawdata.cell(i,phone_index).value)[:-2]  
            else:
                phone1=str(rawdata.cell(i,phone_index).value)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
import numpy as np
import xlrd   #使用库函数

workbook = xlrd.open_workbook('C:/users/lenovo/desktop/student_score.xlsx')  #读取路径
sheet = workbook.sheet_by_name('Sheet1')     #读取excel中的第一个sheet

data_name = sheet.col_values(0)    #按列读取,读取第一列
#data_name1 = sheet.row_values(0)  #按行读取,读取第一行
data_st_ID = sheet.col_values(1)
data_st_score = sheet.col_values(2)

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11

写EXCEL:

Excel_file = xlwt.Workbook() 
sheet = Excel_file.add_sheet('sheet0')
header=[u'号码','日期top1','日期top2','日期top3']
#写入标题行:
for i in range(len(header)):
    sheet.write(0,i,header[i])
#开始按行写入数据:
for i in range(len(phonelist)):
    sheet.write(i+1,0,phonelist[i])
    sheet.write(i+1,1,dic[str(phonelist[i])])
#保存EXCEL:
Excel_file.save("C:/Users/Desktop/100个文件输出xls/"+str(fileName)+".xls")
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12

CSV:

rawdata=pd.read_csv(address,skip_blank_lines=True) #参数为去除空行
if 'start_time' or 'begin_time'  in rawdata.columns:
    if 'start_time' in rawdata.columns:
        start_time=rawdata['start_time']
    elif 'begin_time' in rawdata.columns:
            start_time=rawdata['begin_time']
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6

txt:

rawdata=open(address,'r')
i=0
a=[] #c存放第一行的列名
for line in rawdata:
    if i==1: #默认第二行开始存储通话数据
        a=line.split(',') #逗号作为分隔符
        for j in range(len(a)): #查找指定列名所在的列下标
            if (('-' in str(a[j]))or('/' in str(a[j]))): #判断日期所在列数
                date_index=j #保存日期的列下标
            elif  str(a[j]).isdigit() and len(str(a[j]))>5: #默认全为数字组成的字符串为电话号码
                phone_index=j
            else:
                pass
        break
    else:
        i+=1
i=0
for line in rawdata:#开始转存数据:
    if len(line)<10: #跳过空行
        continue
    data_line=line.split(',') #txt默认以','分隔数据
    if i==0:
        pass #第一行为列名,跳过
        i+=1
    else: #从第二行开始保存数据
        start_time.append(data_line[date_index])
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26

mgf

from pyteomics import mgf
from pyteomics import mgf
 
for spectrum in mgf.read("test.mgf"):
    print (spectrum)
    params = spectrum.get('params')
    key = list(params)[3]
    print(key)
    title = params.get('title')
    seq = params.get('seq')
 
print (params)
print (title)
print (seq)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14

xml

# -*- coding:gb2312 -*-
# coding = utf-8
from pylab import *
import  xml.dom.minidom
def read_xml():
    dom = xml.dom.minidom.parse('abc.xml')#打开xml文档
    cc=dom.getElementsByTagName('caption')
    list_str = [] #字符串
    for item in cc:
        list_str.append(str(item.firstChild.data))

    bb = dom.getElementsByTagName('maxid')
    list_fig = []
    for item in bb:
        list_fig.append(item.firstChild.data)
    su = list_fig[0].encode("gbk")
    list_fig2 = su.split(",")
    list_fig_num = []
    for i in list_fig2:
        list_fig_num.append(int(i))

    ee = dom.getElementsByTagName('time')
    list_tim = []
    for item in ee:
        list_tim.append(item.firstChild.data)
    sg = list_tim[0].encode("gbk")
    list_time = sg.split(",")

    gg = dom.getElementsByTagName('font_size')
    g1 = []
    for item in gg:
        g1.append(item.firstChild.data)
    su = g1[0].encode("gbk")
    return list_str,list_fig_num,list_time,su
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34

在这里插入图片描述
参考:
https://blog.csdn.net/CrazyTTT/article/details/79663895
https://blog.csdn.net/Mr_Cat123/article/details/84857268
https://blog.csdn.net/Z_shoushow/article/details/83022661
https://blog.csdn.net/yiweiyi329/article/details/78184226

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/2023面试高手/article/detail/596049
推荐阅读
相关标签
  

闽ICP备14008679号