如何获得cafe所需要的过滤后的基因家族基因数目文件

Orthofinder产生的Orthogroups.GeneCount.tsv不能直接作为CAFE软件的输入文件,需要做如下处理:
filter_large_gene_family.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
    作者:徐诗芬
    内容:1.cafe的输入格式:去掉total这一列,在第一列添加Description,然后这一列都为空
         2.过滤掉一个或多个物种有超过200个基因拷贝的基因家族,size=200
    日期:2021.2.3
"""
import sys
from itertools import islice

def usage():
    print('Usage: python filter_large_gene_family.py [index_file] [size]')


def main():
    ouf1 = open("cafe_gene_family_format.txt", 'wt')
    ouf2 = open("cafe_large_gene_family_format.txt", 'wt')

    size = eval(sys.argv[2])
    with open(sys.argv[1], 'rt') as f:
        for head in islice(f, 0, 1):
            head = head.strip().split("\t")[:-1]
            ouf1.write("Description\t")
            ouf2.write("Description\t")
            for h in head:
                ouf1.write(h + "\t")
                ouf2.write(h + "\t")
            ouf1.write("\n")
            ouf2.write("\n")
        for line in islice(f, 1, None):  # 跳过第一行按行读取文件内容
            gene_count = line.strip().split("\t")[:-1]
            family_name = gene_count[0]
            gene_list = list(map(int, gene_count[1:]))
            x = [i for i in gene_list if i > size]  # 从列表中找出大于某个值的元素
            if len(x) > 0:
                ouf2.write("\t")
                for n in gene_count:
                    ouf2.write(n)
                    ouf2.write("\t")
                ouf2.write("\n")
            else:
                ouf1.write("\t")
                for n in gene_count:
                    ouf1.write(n)
                    ouf1.write("\t")
                ouf1.write("\n")



    ouf1.close()
    ouf2.close()


try:
    main()
except IndexError:
    usage()

推荐阅读更多精彩内容