Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gene-level alignment codes and example notebook 5c #24

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions examples/5c_functional_enrichments.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Base imports\n",
"import os\n",
"import pickle\n",
"\n",
"# Compute imports\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scipy\n",
"from tqdm.notebook import tqdm, trange\n",
"\n",
"# Plotting imports\n",
"import matplotlib\n",
"matplotlib.rcParams['pdf.fonttype'] = 42\n",
"from matplotlib import pyplot as plt\n",
"import seaborn as sns\n",
"from plotly import express as px\n",
"\n",
"# ML import\n",
"from sklearn.decomposition import NMF\n",
"from sklearn.metrics import mean_squared_error, median_absolute_error\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from pyphylon.util import load_config"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = load_config(\"config.yml\")\n",
"WORKDIR = CONFIG[\"WORKDIR\"]\n",
"SPECIES = CONFIG[\"PG_NAME\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"L_BIN = os.path.join(WORKDIR, f'processed/nmf-outputs/L_binarized.csv')\n",
"L_BIN = pd.read_csv(L_BIN, index_col=0)\n",
"L_BIN.columns = [f'phylon{i}' for i in range(1, L_BIN.shape[1]+1)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# collection functions...\n",
"\n",
"from pyphylon.biointerp import collect_functions\n",
"# only run me once:\n",
"# all_functions = collect_functions(WORKDIR, 'processed/bakta/')\n",
"# all_functions.to_csv(os.path.join(WORKDIR, 'processed/all_functions.csv'))\n",
"\n",
"all_functions = pd.read_csv(os.path.join(WORKDIR, 'processed/all_functions.csv'), index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the pan-genome\n",
"df_genes = pd.read_pickle(os.path.join(WORKDIR, f'processed/cd-hit-results/{SPECIES}_strain_by_gene.pickle.gz'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyphylon.biointerp import get_pg_to_locus_map\n",
"# Data wrangling to get the functions for each cluster \n",
"pg2locus_map = get_pg_to_locus_map(WORKDIR, SPECIES)\n",
"functions2genes = pd.merge(all_functions, pg2locus_map, left_on='locus', right_on='gene_id')\n",
"cluster_functions = functions2genes.groupby('cluster').first().reset_index()[['cluster','product','go']]\n",
"cluster_functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyphylon.biointerp import explode_go_annos\n",
"cluster_to_go_functions = explode_go_annos(cluster_functions)\n",
"cluster_to_go_functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"go_functions_count = cluster_to_go_functions.groupby('go').count()\n",
"go_functions = go_functions_count[go_functions_count['cluster'] > 3].sort_values('cluster', ascending=False)\n",
"go_functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#calculate a single engirchment\n",
"from pyphylon.biointerp import calc_enrichment\n",
"go_term = 'GO:0005524'\n",
"phylon = 'phylon1'\n",
"calc_enrichment(L_BIN, cluster_to_go_functions, go_term, functions2genes, phylon, phylon_contribution_cutoff=0) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyphylon.biointerp import calc_all_phylon_go_enrichments, get_go_mapping # TODO need to speed this up - shrinking functions2genes to only accessory genes seemed to help...\n",
"\n",
"phylon_go_enrichments = calc_all_phylon_go_enrichments(L_BIN, functions2genes, cluster_to_go_functions, go_functions, phylon_contribution_cutoff=0.5)\n",
"phylon_go_enrichments = phylon_go_enrichments[phylon_go_enrichments['p_value']<0.05]\n",
"\n",
"go_mapping = get_go_mapping()\n",
"phylon_go_enrichments = pd.merge(phylon_go_enrichments, go_mapping, left_on='function', right_index=True, how='left')\n",
"missing_go = phylon_go_enrichments[phylon_go_enrichments['name'].isnull()].index\n",
"phylon_go_enrichments.loc[missing_go, 'name'] = phylon_go_enrichments.loc[missing_go,'function']\n",
"\n",
"phylon_go_enrichments = phylon_go_enrichments[phylon_go_enrichments['function']!='SO:0001217'] # filter out SO:0001217 is just a category for \"protein encoding gene\"\n",
"phylon_go_enrichments.to_csv(os.path.join(WORKDIR, 'processed/phylon_go_enrichment.csv'))\n",
"\n",
"phylon_go_enrichments = pd.read_csv(os.path.join(WORKDIR, 'processed/phylon_go_enrichment.csv'), index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"phylon_go_enrichments_mat = pd.pivot_table(phylon_go_enrichments, index='phylon', columns='function', values='p_value')\n",
"sns.clustermap(phylon_go_enrichments_mat.fillna(0.05), cmap='rocket_r')\n",
"plt.title('phylon functional enrichments')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Explore a single phylon:\n",
"phylon = 'phylon1'\n",
"phylon_go_enrichments[phylon_go_enrichments['phylon']==phylon][:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Explore a all phylons:\n",
"from pyphylon.biointerp import gen_phylon_wordcloud\n",
"for phylon in phylon_go_enrichments['phylon'].unique():\n",
" phylon_enr = phylon_go_enrichments[phylon_go_enrichments['phylon']==phylon]\n",
" phylon_enr.loc[:,'products'] = phylon_enr['products'].str.replace(';', '<br>')\n",
" fig = px.scatter(phylon_enr, x='overlap', y='logp', text='name', size='overlap', hover_data='products')\n",
" print(phylon)\n",
" fig.show()\n",
" gen_phylon_wordcloud(L_BIN, functions2genes, phylon, cutoff=0)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "pyphylon",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading