# -*- coding: utf-8 -*-
"""
Description:
After running orthofinder on n fasta file, read the output file 'Orthogroups.csv'
Require a folder 'orthology_based_folder' with this archi:
|-- model_a
-- model_a.sbml
|-- model_b
--model_b.sbml
And the name of the studied organism 'study_id'
1. Read the orthogroups file, extract orthogroups in dict 'all_orthogroups', and all org names
2. In orthology folder search for sbml files 'extension = .sbml'
3. For each models regroup all information in a dict dict_data:
{'study_id': study_id,
'model_id' : model_id,
'sbml_template': path to sbml of model',
'output': path to the output sbml,
'verbose': bool, if true print information
}
The output is by default:
\output_orthofinder_from_'model_id'.sbml
4. Store all previous dict_data in a list all_dict_data
5. iter on dict from all_dict_data and use function dict_data_to_sbml
Use a dict of data dict_data and dict of orthogroups dict_orthogroup to create sbml files.
dict_data and dict_orthogroup are obtained with fun orthofinder_to_sbml
6./ Read dict_orthogroups and check if model associated to dict_data and study org share orthologue
7./ Read sbml of model, parse all reactions and get genes associated to reaction.
8./ For each reactions:
Parse genes associated to sub part (ex: (gene-a and gene-b) or gene-c) = [(gene-a,gene-b), gene-c]
Check if study org have orthologue with at least one sub part (gene-a, gene-b) or gene-c
if yes: add the reaction to the new sbml and change genes ids by study org genes ids
Create the new sbml file.
::
usage:
extract_orthofinder --sbml=FILE/DIR --orthologues=DIR --study_id=STR --output=DIR [--workflow=STR] [-v]
extract_orthofinder --sbml=DIR --orthogroups=FILE --study_id=STR --output=DIR [--workflow=STR] [-v]
option:
-h --help Show help.
--sbml=DIR Folder with sub folder named as models name within sbml file name as model_name.sbml
--orthogroups=FILE Output file of Orthofinder run Orthogroups.tsv
--orthologues=DIR Output directory of Orthofinder run Orthologues
--study_id=ID name of the studied organism
--workflow=ID worklow id in ['aureme','aucome']. specific run architecture where to search sbml files
--output=DIR folder where to create all sbml output files
-v print info
"""
import re
import csv
import libsbml
import docopt
from padmet.utils import sbmlPlugin as sp
from padmet.utils import gbr
import os
[docs]def main():
args = docopt.docopt(__doc__)
verbose = args["-v"]
sbml = args["--sbml"]
orthogroups_file = args["--orthogroups"]
orthologue_folder = args["--orthologues"]
output_folder = args["--output"]
study_id = args["--study_id"]
workflow = args["--workflow"]
if workflow: workflow = workflow.lower()
if workflow not in [None, 'aureme','aucome']:
raise ValueError('--worfklow must be in ["aureme","aucome"]')
all_model_sbml = get_sbml_files(sbml, workflow, verbose)
if orthogroups_file:
orthogroups_to_sbml(orthogroups_file, all_model_sbml, output_folder, study_id, verbose)
elif orthologue_folder:
orthologue_to_sbml(orthologue_folder, all_model_sbml, output_folder, study_id, verbose)
[docs]def get_sbml_files(sbml, workflow = None, verbose = False):
"""
"""
all_model_sbml = {}
if workflow == 'aureme':
orthology_based_folder = os.path.join(sbml, "orthology_based_reconstruction")
if not os.path.exists(orthology_based_folder):
raise ValueError("folder orthology_based_reconstruction not found in %s, should exist if workflow used is %s" %(sbml, workflow))
for path, _, files in os.walk(orthology_based_folder):
fold = os.path.basename(path)
if fold != "orthofinder_wd":
for _file in files:
if _file == fold+".sbml":
all_model_sbml[fold] = os.path.join(path, _file)
elif workflow == 'aucome':
annotation_based_sbml_folder = os.path.join(sbml, "annotation_based/SBMLs")
if not os.path.exists(annotation_based_sbml_folder):
raise ValueError("folder annotation_based/SBMLs not found in %s, should exist if workflow used is %s" %(sbml, workflow))
for _file in next(os.walk(annotation_based_sbml_folder))[2]:
org_id = re.sub("output_pathwaytools_|.sbml", "", _file)
all_model_sbml[org_id] = os.path.join(annotation_based_sbml_folder, _file)
model_organisms_folder = os.path.join(sbml, "model_organisms")
for path, _, files in os.walk(model_organisms_folder):
org_id = os.path.basename(path)
for _file in files:
if _file == org_id+".sbml":
all_model_sbml[org_id] = os.path.join(path, _file)
elif not os.path.isdir(sbml):
org_id = os.path.splitext(os.path.basename(sbml))[0]
all_model_sbml[org_id] = sbml
else:
for _file in next(os.walk(sbml))[2]:
org_id = os.path.splitext(_file)[0]
all_model_sbml[org_id] = os.path.join(sbml, _file)
return all_model_sbml
[docs]def orthogroups_to_sbml(orthogroups_file, all_model_sbml, output_folder, study_id, verbose = False):
"""
After running orthofinder on n fasta file, read the output file 'Orthogroups.csv'
Require a folder 'orthology_based_folder' with this archi:
\model_a
model_a.sbml
\model_b
model_b.sbml
And the name of the studied organism 'study_id'
1. Read the orthogroups file, extract orthogroups in dict 'all_orthogroups', and all org names
2. In orthology folder search for sbml files 'extension = .sbml'
3. For each models regroup all information in a dict dict_data:
{'study_id': study_id,
'model_id' : model_id,
'sbml_template': path to sbml of model',
'output': path to the output sbml,
'verbose': bool, if true print information
}
The output is by default: output_orthofinder_from_'model_id'.sbml
4. Store all previous dict_data in a list all_dict_data
5. iter on dict from all_dict_data and use function dict_data_to_sbml
This function will create a sbml from each model and conserve only reactions associated to ortholog genes
For more information read the doc of func dict_data_to_sbml
Parameters
----------
orthogroups_file: str
path of Orthofinder output file 'Orthogroups.csv'
orthology_based_folder: str
path of folder with model's sbml
output: str
pathname of the output folder of all sbml extracted
study_id: str
name of the studied organism
verbose: bool
if True print information
"""
if verbose:
print("Parsing Orthofinder output %s" %orthogroups_file)
#k=orthogroup_id, v = {k = name, v = set of genes}
dict_orthogroups = {}
all_orgs = set()
with open(orthogroups_file, 'r') as csvfile:
reader = csv.DictReader(csvfile, delimiter = "\t")
for row in reader:
orth_id = row['Orthogroup']
row.pop('Orthogroup')
new_dict = dict([(name, set([gene_id.split("_isoform")[0] for gene_id in set(genes.split(","))])) for (name, genes) in list(row.items()) if genes])
dict_orthogroups[orth_id] = new_dict
all_orgs.update(new_dict.keys())
#check all sbml
if verbose:
print("Start sbml creation...")
all_dict_data = []
if not os.path.exists(output_folder):
if verbose:
print("\tCreating folder %s" %output_folder)
os.makedirs(output_folder)
all_models = all_orgs - set([study_id])
for model_id in all_models:
output_name = "output_orthofinder_from_{0}.sbml".format(model_id)
output = os.path.join(output_folder, output_name)
sbml_template = all_model_sbml[model_id]
dict_data = {'study_id': study_id, 'model_id': model_id, 'sbml_template': sbml_template, 'output': output, 'verbose':verbose}
all_dict_data.append(dict_data)
for dict_data in all_dict_data:
dict_data_to_sbml(dict_data, dict_orthogroups=dict_orthogroups)
[docs]def orthologue_to_sbml(orthologue_folder, all_model_sbml, output_folder, study_id, verbose = False):
"""
After running orthofinder on n fasta file, read the output file 'Orthogroups.csv'
Require a folder 'orthology_based_folder' with this archi:
\model_a
model_a.sbml
\model_b
model_b.sbml
And the name of the studied organism 'study_id'
1. Read the orthogroups file, extract orthogroups in dict 'all_orthogroups', and all org names
2. In orthology folder search for sbml files 'extension = .sbml'
3. For each models regroup all information in a dict dict_data:
{'study_id': study_id,
'model_id' : model_id,
'sbml_template': path to sbml of model',
'output': path to the output sbml,
'verbose': bool, if true print information
}
The output is by default: output_orthofinder_from_'model_id'.sbml
4. Store all previous dict_data in a list all_dict_data
5. iter on dict from all_dict_data and use function dict_data_to_sbml
This function will create a sbml from each model and conserve only reactions associated to ortholog genes
For more information read the doc of func dict_data_to_sbml
Parameters
----------
orthogroups_file: str
path of Orthofinder output file 'Orthogroups.csv'
orthology_based_folder: str
path of folder with model's sbml
output: str
pathname of the output folder of all sbml extracted
study_id: str
name of the studied organism
verbose: bool
if True print information
"""
if verbose:
print("Parsing Orthofinder output %s" %orthologue_folder)
#k=orthogroup_id, v = {k = name, v = set of genes}
all_orgs = set()
all_orthologue_files = []
for _path, _folders, _files in os.walk(orthologue_folder):
for _folder in _folders:
all_orgs.add(_folder.replace("Orthologues_",""))
for _file in _files:
all_orthologue_files.append(os.path.join(_path,_file))
dict_orthologues = {}
for org in all_orgs:
dict_orthologues[org] = dict()
#dict_orthologue: k = org_id, v = dict: k = gene_id, v = dict: k = org_id, v = set of gene orthologue id
for orthologue_file in [i for i in all_orthologue_files if "__v__" in i]:
with open(orthologue_file, 'r') as csvfile:
reader = csv.DictReader(csvfile, delimiter = "\t")
orgs = list(reader.fieldnames)
orgs.remove('Orthogroup')
org_A, org_B = orgs
for row in reader:
gene_ids_A = [gene_id.split("_isoform")[0] for gene_id in row[org_A].split(", ")]
gene_ids_B = [gene_id.split("_isoform")[0] for gene_id in row[org_B].split(", ")]
for gene_id_A in gene_ids_A:
if gene_id_A not in dict_orthologues[org_A].keys():
dict_orthologues[org_A][gene_id_A] = dict()
try:
dict_orthologues[org_A][gene_id_A][org_B] = set(gene_ids_B)
except KeyError:
dict_orthologues[org_A][gene_id_A] = {org_B:set(gene_ids_B)}
"""
for gene_id_B in gene_ids_B:
if gene_id_B not in dict_orthologue[org_B].keys():
dict_orthologue[org_B][gene_id_B] = dict()
try:
dict_orthologue[org_B][gene_id_B][org_A] = set(gene_ids_A)
except KeyError:
dict_orthologue[org_B][gene_id_B] = {org_A:set(gene_ids_A)}
"""
if verbose:
print("Start sbml creation...")
all_dict_data = []
if not os.path.exists(output_folder):
if verbose:
print("\tCreating folder %s" %output_folder)
os.makedirs(output_folder)
all_models = all_orgs - set([study_id])
for model_id in all_models:
output_name = "output_orthofinder_from_{0}.sbml".format(model_id)
output = os.path.join(output_folder, output_name)
sbml_template = all_model_sbml[model_id]
dict_data = {'study_id': study_id, 'model_id': model_id, 'sbml_template': sbml_template, 'output': output, 'verbose':verbose}
all_dict_data.append(dict_data)
for dict_data in all_dict_data:
dict_data_to_sbml(dict_data, dict_orthologues=dict_orthologues)
[docs]def dict_data_to_sbml(dict_data, dict_orthogroups=None, dict_orthologues=None, strict_match=True):
"""
Use a dict of data dict_data and dict of orthogroups dict_orthogroup to create sbml files.
dict_data and dict_orthogroup are obtained with fun orthofinder_to_sbml
1./ Read dict_orthogroups and check if model associated to dict_data and study org share orthologue
2./ Read sbml of model, parse all reactions and get genes associated to reaction.
3./ For each reactions:
Parse genes associated to sub part (ex: (gene-a and gene-b) or gene-c) = [(gene-a,gene-b), gene-c]
Check if study org have orthologue with at least one sub part (gene-a, gene-b) or gene-c
if yes: add the reaction to the new sbml and change genes ids by study org genes ids
4./ Create the new sbml file.
Parameters
----------
dict_data: dict
{'study_id': study_id,
'model_id' : model_id,
'sbml_template': path to sbml of model',
'output': path to the output sbml,
'verbose': bool, if true print information
}
dict_orthogroup: dict
k=orthogroup_id, v = {k = name, v = set of genes}
verbose: bool
if True print information
"""
#dict_data = {'study_name':'', 'o_compare_name': '', sbml_template':'', 'output':''}
study_id = dict_data['study_id']
model_id = dict_data['model_id']
sbml_template = dict_data['sbml_template']
output = dict_data['output']
verbose = dict_data.get('verbose')
if dict_orthogroups:
if verbose:
print("*Extracting orthogroups data to create sbml of {0} from {1}".format(study_id, model_id))
#k = gene_id from to_compare, v = list of genes id of study
sub_dict_orth = {}
for k in dict_orthogroups.values():
try:
all_to_compare_genes = k[model_id]
all_study_genes = k[study_id]
for to_compare_gene in all_to_compare_genes:
try:
sub_dict_orth[to_compare_gene].update(all_study_genes)
except KeyError:
sub_dict_orth[to_compare_gene] = set(all_study_genes)
except KeyError:
pass
if not sub_dict_orth:
if verbose:
print("\t{0} and {1} don't share any ortholgue".format(study_id, model_id))
return
elif dict_orthologues:
if verbose:
print("*Extracting orthologues data to create sbml of {0} from {1}".format(study_id, model_id))
#k = gene_id from to_compare, v = list of genes id of study
sub_dict_orth = {}
for gene_id, gene_dict in dict_orthologues[model_id].items():
try:
sub_dict_orth[gene_id] = gene_dict[study_id]
except KeyError:
pass
if not sub_dict_orth:
if verbose:
print("\t{0} and {1} don't share any ortholgue".format(study_id, model_id))
return
else:
ValueError("Must give one dict of orthogroups or orthologue")
reader = libsbml.SBMLReader()
document_to_compare = reader.readSBML(sbml_template)
for i in range(document_to_compare.getNumErrors()):
print(document_to_compare.getError(i).getMessage())
model_to_compare = document_to_compare.getModel()
listOfReactions_with_genes = [rxn for rxn in model_to_compare.getListOfReactions()
if sp.parseNotes(rxn).get("GENE_ASSOCIATION",[None])[0]]
if verbose:
print("\tSbml of {0} contains {1}/{2} reactions with genes assocation".format(model_id, len(listOfReactions_with_genes), len(model_to_compare.getListOfReactions())))
dict_rxn_ga = {}
for rxn in listOfReactions_with_genes:
ga = sp.parseNotes(rxn)['GENE_ASSOCIATION'][0]
ga_for_gbr = re.sub(r" or " , "|", ga)
ga_for_gbr = re.sub(r" and " , "&", ga_for_gbr)
ga_for_gbr = re.sub(r"\s" , "", ga_for_gbr)
if re.findall("\||&", ga_for_gbr):
to_compare_ga_subsets = list(gbr.compile_input(ga_for_gbr))
else:
ga_for_gbr = re.sub(r"\(|\)" , "", ga_for_gbr)
to_compare_ga_subsets = list([ga_for_gbr])
study_ga_subsets = []
"""
to_compare_ga_subsets = [('a','c','d'),('c',)]
sub_dict_orth = {'a':['a_a'],'c':['c_c'], 'd':['d_d']}
"""
for to_compare_subset in to_compare_ga_subsets:
study_subset = set()
for gene in to_compare_subset:
if gene in list(sub_dict_orth.keys()):
study_subset.update(sub_dict_orth[gene])
else:
study_subset = set()
break
if study_subset:
"""
if verbose:
print("\t\t{0} == {1}".format(tuple(to_compare_subset), tuple(study_subset)))
"""
study_ga_subsets.append(study_subset)
if study_ga_subsets:
study_ga = " or ".join(["("+" and ".join(subset)+")" for subset in study_ga_subsets])
if verbose:
print("\t\tAdding %s" %rxn.id)
print("\t\tGENE_ASSOCIATION: %s" %(study_ga))
dict_rxn_ga[rxn.id] = study_ga
if not dict_rxn_ga:
if verbose:
print("\tNo reaction added from {0} to {1} because of missing orthologues".format(model_id, study_id))
return
rxn_id_to_remove = set([rxn.id for rxn in model_to_compare.getListOfReactions()]).difference(list(dict_rxn_ga.keys()))
if verbose:
print("\tRemoving %s unused reactions" %len(rxn_id_to_remove))
[model_to_compare.removeReaction(rxn_id) for rxn_id in rxn_id_to_remove]
cpd_id_to_preserve = set()
for rxn_id, study_ga in list(dict_rxn_ga.items()):
rxn = model_to_compare.getElementBySId(rxn_id)
#update notes
notes_in_dict = sp.parseNotes(rxn)
notes_in_dict["GENE_ASSOCIATION"] = [study_ga]
notes = "<body xmlns=\"http://www.w3.org/1999/xhtml\">"
for k,v_list in list(notes_in_dict.items()):
for v in v_list:
notes += "<p>"+k+": "+v+"</p>"
notes += "</body>"
rxn.setNotes(notes)
cpd_in_rxn = set([p.getSpecies() for p in rxn.getListOfProducts()]).union(\
set([r.getSpecies() for r in rxn.getListOfReactants()]))
cpd_id_to_preserve.update(cpd_in_rxn)
all_species = [cpd.id for cpd in model_to_compare.getListOfSpecies()]
[model_to_compare.removeSpecies(cpd_id ) for cpd_id in all_species if cpd_id not in cpd_id_to_preserve]
new_id = os.path.basename(os.path.splitext(output)[0])
model_to_compare.setId(new_id)
libsbml.writeSBMLToFile(document_to_compare, output)
if __name__ == "__main__":
main()