当前位置:   article > 正文

使用Diamond比对NR数据库获取物种注释_diamond nr

diamond nr

 之前用Kraken2注释宏基因组的contig,发现只有30%左右可以被Kraken2注释

Kraken2+Bracken:宏基因组物种注释-CSDN博客

不信邪,再用NR库试试 

参考:

将NR数据库diamond比对结果做物种注释_diamond 物种注释-CSDN博客

NR下载

  1. nohup wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz &
  2. wget -c https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5
  3. #检查下载的完整性(一样的话就是完整了)
  4. md5sum nr.gz ### 899ac219cac213c60fede9c3d9ef8f7b nr.gz
  5. cat nr.gz.md5 ### 899ac219cac213c60fede9c3d9ef8f7b nr.gz
  6. nohup gunzip nr.gz &
  7. mv nr nr.fa
  8. ###下载NR物种相关信息和taxid信息
  9. wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz
  10. wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5
  11. wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
  12. wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5
  13. #检查完整性
  14. md5sum prot.accession2taxid.gz #32b8ca7ea712e161c72af69135fc938e
  15. cat prot.accession2taxid.gz.md5 #32b8ca7ea712e161c72af69135fc938e
  16. md5sum taxdump.tar.gz #26558b800bc1b3795c25d1f0ead65412
  17. cat taxdump.tar.gz.md5 #26558b800bc1b3795c25d1f0ead65412
  18. #解压
  19. tar -zxvf taxdump.tar.gz
  20. gunzip prot.accession2taxid.gz
  21. #移动需要的文件
  22. cd ~
  23. mkdir .taxonkit
  24. cp *.dmp ~/.taxonkit

 必要软件下载

  1. conda create -n NR_database_search
  2. conda activate NR_database_search
  3. conda install -c bioconda taxonkit=0.15.0
  4. conda install -c bioconda diamond=2.1.8
  5. conda install -c bioconda csvtk=0.28.0

diamond建库 

nohup diamond makedb --threads 180 --in nr.fa --db NR_2023_07_23 &

diamond比对

  1. diamond blastx --db NR_2023_07_23.dmnd --query nucleic_reads.fna\
  2. -o nucleic_matches_fmt6.txt --threads 180 --evalue 0.00001 \
  3. --max-target-seqs 5 --outfmt 6
  4. diamond blastp --db NR_2023_07_23.dmnd --query protein_reads.fna\
  5. -o protein_matches_fmt6.txt --threads 180 --evalue 0.00001 \
  6. --max-target-seqs 5 --outfmt 6
  7. ## --outfmt 6 最好别改变

 这些参数可以调整diamond比对的速度or准确性

 这几个参数可以调整比对的coverage,identity,score

结果如下 !!!(这个表头后面python会加)

taxonkit获得物种分类信息表

感谢大佬:一文完成nt库序列快速下载及blast结果注释物种 (qq.com)

得到seqid注释之后,可以搜索注释

  1. ## 一些主要的物种编号
  2. # 2 bacteria
  3. # 2157 archaea
  4. # 4751 fungi
  5. # 10239 virus
  6. # 2759 Eukaryota
  7. #看taxnokit安好了么
  8. taxonkit -h
  9. #创建目录
  10. cd /home/zhongpei/database/NR_2023_07_23
  11. mkdir /home/zhongpei/database/NR_2023_07_23/NCBI_Main_tax
  12. cd NCBI_Main_tax
  13. #开始
  14. taxonkit list -j 4 --ids 2,2157,4751,10239,2759 --indent "" > NCBI_Main.taxid.txt
  15. # -j 是线程,软件说4个够了;--ids 是需要的物种编号,用逗号分隔
  16. wc -l NCBI_Main.taxid.txt # 2708739 NCBI_Main.taxid.txt
  17. head -n 5 NCBI_Main.taxid.txt #查看内容
  18. # 提取taxid和taxonomy(界门纲目科属种)的对应信息到NCBI_Main.taxid.txt
  19. less NCBI_Main.taxid.txt | taxonkit reformat -I 1 -r Unassigned -f "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}\t{t}"| sed '1i\Taxid\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecies\tStrain' > NCBI_Main.taxid_new.txt
  20. # -I 1 一个制表符分隔;-r 没有找到的用什么字符去填充,这里用的“Unassigned”
  21. # -f 输出的格式;1i表示在第行之前插入文本(sed用法,不太会)

完成! 

vim NCBI_Main.taxid_new.txt

把第一个Taxid改成小写

seqid和taxid的对应

还记不记得第一步下载过一个 "prot.accession2taxid" ,现在要派上用场了

其实python也能做,csvtk太不熟悉了,先来学习一下吧,感觉还挺方便(这一步比较慢)

  1. cat prot.accession2taxid | csvtk -t grep -f taxid -P NCBI_Main.taxid.txt | csvtk -t cut -f accession.version,taxid > NCBI_seqid_taxid.txt
  2. # "cat prot.accession2taxid |" 把 prot.accession2taxid 的内容到下面的 csvtk
  3. # -t 输入内容是制表符分隔;grep 这是csvtk的1个子命令,用于在文件中搜索匹配的行
  4. # -P 搜索那些"taxid"字段的值出现在"NCBI_Main.taxid.txt"文件中的行
  5. # cut 这是csvtk的1个子命令,用于从输入中选择特定的字段

NCBI_seqid_taxid.txt 就是目标文件

diamond seqid和taxid对应,再和界门纲目科属种对应

 把diamond结果文件与NCBI_seqid_taxid.txt对应

  1. #!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
  2. # ##########################################################
  3. # match diamond blast NR result.txt and NCBI_seqid_taxid.txt
  4. # written by PeiZhong in IFR of CAAS
  5. import os
  6. import argparse
  7. parser = argparse.ArgumentParser(description='match diamond blast NR result.txt and NCBI_seqid_taxid.txt. '
  8. '!!! all the file should end will .txt !!!')
  9. parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your diamond result txt')
  10. parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
  11. 'your two-column diamond results file in this folder, e.g., clean, '
  12. 'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
  13. parser.add_argument('NCBI_acc_taxid',help='full Path that contain NCBI_seqid_taxid.txt')
  14. args = parser.parse_args()
  15. result_file_folder_path = args.diamond_result_folder_path
  16. NCBI_file = args.NCBI_acc_taxid
  17. file_mark = args.result_files_mark
  18. os.chdir(result_file_folder_path)
  19. files = os.listdir(result_file_folder_path)
  20. print(files)
  21. db={}
  22. print("start db read")
  23. f_table = open("%s" % (NCBI_file), 'r')
  24. print("start db build")
  25. for line in f_table.readlines():
  26. line=line.split('\t')
  27. acc_num = line[0].strip()
  28. tax_num = line[1].strip()
  29. db[acc_num] = tax_num
  30. print("finish db build")
  31. file_ls = []
  32. for result_file in files:
  33. if file_mark in result_file:
  34. file_ls.append(result_file)
  35. file_ls.sort()
  36. print(file_ls)
  37. header = "qseqid\taccession.version\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
  38. for result_file in file_ls:
  39. print(result_file)
  40. f_result = open("%s" % (result_file), 'r+')
  41. content = f_result.read()
  42. f_result.seek(0, 0)
  43. f_result.write(header.rstrip('\r\n') + '\n' + content)
  44. f_result.close()
  45. out_name = str(result_file).strip('txt')
  46. out_name = out_name+'_taxid.txt'
  47. f_result = open("%s" % (result_file), 'r')
  48. f_out = open("%s" % (out_name), 'a')
  49. for line in f_result.readlines():
  50. line = line.split('\t')
  51. query_num = line[0].strip()
  52. acc_q_num = line[1].strip()
  53. if acc_q_num in db:
  54. print(query_num,end="\t",file=f_out)
  55. print(acc_q_num, end="\t", file=f_out)
  56. print(db[acc_q_num], file=f_out)
  57. f_out.close()
  58. f_result.close()
  59. f_table.close()

wAAACH5BAEKAAAALAAAAAABAAEAAAICRAEAOw==

  1. chmod +x diamond_NR_tax.py(完整地址)
  2. diamond_NR_tax.py(完整地址) -h
  3. diamond_NR_tax.py(完整地址) diamond_result_folder_path result_files_mark NCBI_acc_taxid
  4. diamond_result_folder_path:你存放上面处理完的diamond比对文件,txt结尾的文件,的目录(完整地址)
  5. result_files_mark:这个地址中,你的这些文件独有的标识字符串
  6. #for example,the mark of result file 'SY10_clean.txt' and 'SY11_clean.txt' is 'clean'
  7. NCBI_acc_taxid:你的NCBI_seqid_taxid.txt文件的完整地址

wAAACH5BAEKAAAALAAAAAABAAEAAAICRAEAOw==

做完结果是这样的 

接下来再写个python代码根据taxid把这个文件和界门纲目科属种联系起来就行(不好意思只会python,我检讨。。。。但是python简单呀)

  1. #!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
  2. # ##########################################################
  3. # match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt
  4. # written by PeiZhong in IFR of CAAS
  5. import os
  6. import argparse
  7. parser = argparse.ArgumentParser(description='match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt. '
  8. '!!! all the file should end will .txt !!!')
  9. parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your result txt')
  10. parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
  11. 'your two-column diamond results file in this folder, e.g., clean, '
  12. 'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
  13. parser.add_argument('NCBI_taxid_tax',help='full Path that contain NCBI_Main.taxid_new.txt')
  14. args = parser.parse_args()
  15. result_file_folder_path = args.diamond_result_folder_path
  16. NCBI_file = args.NCBI_taxid_tax
  17. file_mark = args.result_files_mark
  18. os.chdir(result_file_folder_path)
  19. files = os.listdir(result_file_folder_path)
  20. print(files)
  21. db={}
  22. print("start db read")
  23. f_table = open("%s" % (NCBI_file), 'r')
  24. print("start db build")
  25. for line in f_table.readlines():
  26. line=line.split('\t')
  27. taxid = line[0].strip()
  28. tax_anno = line[1:8]
  29. db[taxid] = tax_anno
  30. print("finish db build")
  31. file_ls = []
  32. for result_file in files:
  33. if file_mark in result_file:
  34. file_ls.append(result_file)
  35. file_ls.sort()
  36. print(file_ls)
  37. for result_file in file_ls:
  38. print(result_file)
  39. out_name = str(result_file).strip('txt')
  40. out_name = out_name+'_tax.txt'
  41. f_result = open("%s" % (result_file), 'r')
  42. f_out = open("%s" % (out_name), 'a')
  43. for line in f_result.readlines():
  44. line = line.split('\t')
  45. query_num = line[0].strip()
  46. acc_q_num = line[1].strip()
  47. taxid_1 = line[2].strip()
  48. if taxid_1 in db:
  49. print(query_num,end="\t",file=f_out)
  50. print(acc_q_num, end="\t", file=f_out)
  51. print(taxid_1, end="\t", file=f_out)
  52. tax_in_db = db[taxid_1]
  53. str_ls = map(str, tax_in_db)
  54. tax = '\t'.join(str_ls)
  55. print(tax, file=f_out)
  56. f_out.close()
  57. f_result.close()
  58. f_table.close()

wAAACH5BAEKAAAALAAAAAABAAEAAAICRAEAOw==

和上面一样也需要给权限

好了,可以交差了 !我宣布python是我们初学者的yyds

再把多个样本的结果结合到一起成为表格 

  1. #!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
  2. # ##########################################################
  3. # match diamond blast NR result.txt and NCBI_seqid_taxid.txt
  4. # written by PeiZhong in IFR of CAAS
  5. import argparse
  6. import os
  7. import pandas as pd
  8. import csv
  9. parser = argparse.ArgumentParser(description='tax files combine')
  10. parser.add_argument('tax_result_folder_path',help='full Path of the folder that contain your tax result txt')
  11. parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
  12. 'your two-column diamond results file in this folder, e.g., clean, '
  13. 'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
  14. args = parser.parse_args()
  15. result_file_folder_path = args.tax_result_folder_path
  16. file_mark = args.result_files_mark
  17. path = os.chdir(result_file_folder_path)
  18. files = os.listdir(result_file_folder_path)
  19. file_ls = []
  20. for file in files:
  21. if file_mark in file:
  22. file_ls.append(file)
  23. file_ls.sort()
  24. print(file_ls)
  25. for file in file_ls:
  26. df = pd.read_csv('%s' % (file),sep='\t')
  27. df.drop_duplicates(subset='qseqid', keep='first', inplace=True)
  28. outname = str(file).rsplit(".", 1)[0]
  29. df.to_csv(outname+'_only1.txt', index=False, sep='\t')
  30. path = os.chdir(result_file_folder_path)
  31. files = os.listdir(result_file_folder_path)
  32. file2_ls = []
  33. for file2 in files:
  34. if "only1" in file2:
  35. file2_ls.append(file2)
  36. file2_ls.sort()
  37. print(file2_ls)
  38. def tax_finder(file):
  39. tax_ls = []
  40. with open(file, 'r') as f:
  41. for line in f.readlines():
  42. if "qseqid" not in line:
  43. tax = ';'.join(line.split('\t')[3:9])
  44. tax_ls.append(tax)
  45. db = {}
  46. for i in tax_ls:
  47. if i in db:
  48. db[i] += 1
  49. if i not in db:
  50. db[i] = 1
  51. output = str(file).rsplit(".",1)[0]+"_count1.txt"
  52. with open(output,"a") as f3:
  53. for key, value in db.items():
  54. key = key.strip()
  55. print(key, end="\t", file=f3)
  56. print(value, file=f3)
  57. for file in file2_ls:
  58. tax_finder(file)
  59. path = os.chdir(result_file_folder_path)
  60. files = os.listdir(result_file_folder_path)
  61. file3_ls = []
  62. for file3 in files:
  63. if "count1" in file3:
  64. file3_ls.append(file3)
  65. file3_ls.sort()
  66. print(file3_ls)
  67. all_tax = {}
  68. for file in file3_ls:
  69. with open("%s" % (file),"r") as f4:
  70. for line in f4.readlines():
  71. tax = line.split('\t')[0]
  72. all_tax[tax] = 0
  73. print(all_tax)
  74. ls_db_result = []
  75. for file in file3_ls:
  76. with open("%s" % (file),"r") as f5:
  77. db_name = str(file)
  78. db_name = all_tax.copy()
  79. for line in f5.readlines():
  80. tax = line.split('\t')[0]
  81. count = line.strip("\n").split('\t')[1]
  82. if tax in db_name:
  83. db_name[tax] = count
  84. ls_db_result.append(db_name)
  85. header=[]
  86. for key in all_tax.keys():
  87. header.append(key)
  88. with open('merge_overview_2.csv', 'a', encoding='utf-8', newline='') as f7:
  89. dictWriter = csv.DictWriter(f7,header)
  90. dictWriter.writeheader()
  91. dictWriter.writerows(ls_db_result)
  92. f7_rows=[]
  93. with open('merge_overview_2.csv',"r") as f7:
  94. for line in f7.readlines():
  95. f7_rows.append(line)
  96. with open('merge_overview_3.csv',"a") as f8:
  97. print("",end=",",file=f8)
  98. print(f7_rows[0].strip("\n"),file=f8)
  99. row_count=0
  100. for i in file3_ls:
  101. row_count += 1
  102. print(i,end=",",file=f8)
  103. print(f7_rows[row_count].strip("\n"),file=f8)

  

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/凡人多烦事01/article/detail/453754
推荐阅读
相关标签
  

闽ICP备14008679号